Instructions to use eac123/sublim-phase4-combo-03 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/sublim-phase4-combo-03 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/sublim-phase4-combo-03")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.138745129108429, | |
| "epoch": 0.0037313432835820895, | |
| "grad_norm": 1.7020611763000488, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4721007347106934, | |
| "mean_token_accuracy": 0.5372578650712967, | |
| "num_tokens": 16325.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2262731790542603, | |
| "epoch": 0.007462686567164179, | |
| "grad_norm": 1.5422499179840088, | |
| "learning_rate": 0.0002, | |
| "loss": 2.1402571201324463, | |
| "mean_token_accuracy": 0.5742411762475967, | |
| "num_tokens": 32666.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.409499078989029, | |
| "epoch": 0.011194029850746268, | |
| "grad_norm": 1.1927348375320435, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7202329635620117, | |
| "mean_token_accuracy": 0.5956366509199142, | |
| "num_tokens": 48877.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3392578959465027, | |
| "epoch": 0.014925373134328358, | |
| "grad_norm": 0.9159098863601685, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3790637254714966, | |
| "mean_token_accuracy": 0.6494399756193161, | |
| "num_tokens": 65097.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.329741895198822, | |
| "epoch": 0.018656716417910446, | |
| "grad_norm": 0.9530413150787354, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2827703952789307, | |
| "mean_token_accuracy": 0.649653822183609, | |
| "num_tokens": 81423.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2239453792572021, | |
| "epoch": 0.022388059701492536, | |
| "grad_norm": 0.45381543040275574, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1552369594573975, | |
| "mean_token_accuracy": 0.6654698848724365, | |
| "num_tokens": 97674.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.1408285796642303, | |
| "epoch": 0.026119402985074626, | |
| "grad_norm": 0.40323638916015625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.063366413116455, | |
| "mean_token_accuracy": 0.6731287389993668, | |
| "num_tokens": 114207.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.0467455089092255, | |
| "epoch": 0.029850746268656716, | |
| "grad_norm": 0.4862216114997864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9919917583465576, | |
| "mean_token_accuracy": 0.6862917095422745, | |
| "num_tokens": 130364.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 0.9914536625146866, | |
| "epoch": 0.033582089552238806, | |
| "grad_norm": 0.563399612903595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9576236605644226, | |
| "mean_token_accuracy": 0.6916692554950714, | |
| "num_tokens": 146675.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9863343089818954, | |
| "epoch": 0.03731343283582089, | |
| "grad_norm": 0.4532151520252228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8861619234085083, | |
| "mean_token_accuracy": 0.7066572606563568, | |
| "num_tokens": 162793.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9439148157835007, | |
| "epoch": 0.041044776119402986, | |
| "grad_norm": 0.4917202889919281, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8438840508460999, | |
| "mean_token_accuracy": 0.7115702927112579, | |
| "num_tokens": 178972.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.86412213742733, | |
| "epoch": 0.04477611940298507, | |
| "grad_norm": 0.4633786678314209, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8079400658607483, | |
| "mean_token_accuracy": 0.7117275148630142, | |
| "num_tokens": 195446.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.7569762617349625, | |
| "epoch": 0.048507462686567165, | |
| "grad_norm": 0.4152548909187317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7479823231697083, | |
| "mean_token_accuracy": 0.7288273125886917, | |
| "num_tokens": 211604.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7370023280382156, | |
| "epoch": 0.05223880597014925, | |
| "grad_norm": 0.38331395387649536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7293781638145447, | |
| "mean_token_accuracy": 0.7328485548496246, | |
| "num_tokens": 228114.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.6818548142910004, | |
| "epoch": 0.055970149253731345, | |
| "grad_norm": 0.4065186679363251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6948679685592651, | |
| "mean_token_accuracy": 0.7417702227830887, | |
| "num_tokens": 244615.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.6801213175058365, | |
| "epoch": 0.05970149253731343, | |
| "grad_norm": 0.3765408992767334, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6942192316055298, | |
| "mean_token_accuracy": 0.7383946776390076, | |
| "num_tokens": 260940.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6828830540180206, | |
| "epoch": 0.06343283582089553, | |
| "grad_norm": 0.31789109110832214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6663458347320557, | |
| "mean_token_accuracy": 0.7480802536010742, | |
| "num_tokens": 277198.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6609166115522385, | |
| "epoch": 0.06716417910447761, | |
| "grad_norm": 0.3814696669578552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6373794078826904, | |
| "mean_token_accuracy": 0.7566290199756622, | |
| "num_tokens": 293415.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6822013854980469, | |
| "epoch": 0.0708955223880597, | |
| "grad_norm": 0.3390759527683258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6543835997581482, | |
| "mean_token_accuracy": 0.7451244294643402, | |
| "num_tokens": 309815.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.632593423128128, | |
| "epoch": 0.07462686567164178, | |
| "grad_norm": 0.41862595081329346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6299830675125122, | |
| "mean_token_accuracy": 0.7534051537513733, | |
| "num_tokens": 326057.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6358507871627808, | |
| "epoch": 0.07835820895522388, | |
| "grad_norm": 0.30084753036499023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.62652587890625, | |
| "mean_token_accuracy": 0.7561640441417694, | |
| "num_tokens": 342366.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.601889356970787, | |
| "epoch": 0.08208955223880597, | |
| "grad_norm": 0.30453744530677795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5936654210090637, | |
| "mean_token_accuracy": 0.7655821740627289, | |
| "num_tokens": 358935.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.5926243662834167, | |
| "epoch": 0.08582089552238806, | |
| "grad_norm": 0.24678799510002136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5894668698310852, | |
| "mean_token_accuracy": 0.7695567756891251, | |
| "num_tokens": 375125.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.5948957055807114, | |
| "epoch": 0.08955223880597014, | |
| "grad_norm": 0.26838821172714233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5975726246833801, | |
| "mean_token_accuracy": 0.766963854432106, | |
| "num_tokens": 391519.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.5925572067499161, | |
| "epoch": 0.09328358208955224, | |
| "grad_norm": 0.24850629270076752, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5895435214042664, | |
| "mean_token_accuracy": 0.7683891654014587, | |
| "num_tokens": 408003.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.579643040895462, | |
| "epoch": 0.09701492537313433, | |
| "grad_norm": 0.24649304151535034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5773741006851196, | |
| "mean_token_accuracy": 0.7704576104879379, | |
| "num_tokens": 424170.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.579850047826767, | |
| "epoch": 0.10074626865671642, | |
| "grad_norm": 0.24893403053283691, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705626010894775, | |
| "mean_token_accuracy": 0.7733898609876633, | |
| "num_tokens": 440584.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.5937480330467224, | |
| "epoch": 0.1044776119402985, | |
| "grad_norm": 0.222214013338089, | |
| "learning_rate": 0.0002, | |
| "loss": 0.584485650062561, | |
| "mean_token_accuracy": 0.7649911344051361, | |
| "num_tokens": 456887.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.5631287395954132, | |
| "epoch": 0.10820895522388059, | |
| "grad_norm": 0.26287850737571716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559370219707489, | |
| "mean_token_accuracy": 0.7786488234996796, | |
| "num_tokens": 473285.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.5510498583316803, | |
| "epoch": 0.11194029850746269, | |
| "grad_norm": 0.2989422380924225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596640110015869, | |
| "mean_token_accuracy": 0.7761659324169159, | |
| "num_tokens": 489394.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5780725926160812, | |
| "epoch": 0.11567164179104478, | |
| "grad_norm": 0.23725202679634094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5835093259811401, | |
| "mean_token_accuracy": 0.7684815227985382, | |
| "num_tokens": 505756.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5761191546916962, | |
| "epoch": 0.11940298507462686, | |
| "grad_norm": 0.2031526267528534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5835364460945129, | |
| "mean_token_accuracy": 0.7682848125696182, | |
| "num_tokens": 522094.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5485773086547852, | |
| "epoch": 0.12313432835820895, | |
| "grad_norm": 0.20444567501544952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546419620513916, | |
| "mean_token_accuracy": 0.777488186955452, | |
| "num_tokens": 538415.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5861198753118515, | |
| "epoch": 0.12686567164179105, | |
| "grad_norm": 0.21942971646785736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5825690031051636, | |
| "mean_token_accuracy": 0.7697215527296066, | |
| "num_tokens": 554886.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5715848505496979, | |
| "epoch": 0.13059701492537312, | |
| "grad_norm": 0.20764704048633575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570915162563324, | |
| "mean_token_accuracy": 0.7720184922218323, | |
| "num_tokens": 571367.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5560943633317947, | |
| "epoch": 0.13432835820895522, | |
| "grad_norm": 0.20819340646266937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549942851066589, | |
| "mean_token_accuracy": 0.7778844088315964, | |
| "num_tokens": 587594.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.556964784860611, | |
| "epoch": 0.13805970149253732, | |
| "grad_norm": 0.17859336733818054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563804507255554, | |
| "mean_token_accuracy": 0.7767369300127029, | |
| "num_tokens": 604052.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5532324761152267, | |
| "epoch": 0.1417910447761194, | |
| "grad_norm": 0.18194721639156342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552038550376892, | |
| "mean_token_accuracy": 0.7764725238084793, | |
| "num_tokens": 620200.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5707972347736359, | |
| "epoch": 0.1455223880597015, | |
| "grad_norm": 0.17879748344421387, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568923830986023, | |
| "mean_token_accuracy": 0.7714048773050308, | |
| "num_tokens": 636528.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5603279024362564, | |
| "epoch": 0.14925373134328357, | |
| "grad_norm": 0.19374136626720428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574957728385925, | |
| "mean_token_accuracy": 0.7773427516222, | |
| "num_tokens": 652629.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5546282231807709, | |
| "epoch": 0.15298507462686567, | |
| "grad_norm": 0.19636894762516022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532153844833374, | |
| "mean_token_accuracy": 0.7793182134628296, | |
| "num_tokens": 668683.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5812623649835587, | |
| "epoch": 0.15671641791044777, | |
| "grad_norm": 0.17162267863750458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5755793452262878, | |
| "mean_token_accuracy": 0.7692758589982986, | |
| "num_tokens": 685277.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5617634505033493, | |
| "epoch": 0.16044776119402984, | |
| "grad_norm": 0.16276565194129944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628421306610107, | |
| "mean_token_accuracy": 0.7769913524389267, | |
| "num_tokens": 701728.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5570202618837357, | |
| "epoch": 0.16417910447761194, | |
| "grad_norm": 0.16841551661491394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597431659698486, | |
| "mean_token_accuracy": 0.7756171226501465, | |
| "num_tokens": 718323.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5491841286420822, | |
| "epoch": 0.16791044776119404, | |
| "grad_norm": 0.14662496745586395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556524991989136, | |
| "mean_token_accuracy": 0.7775459736585617, | |
| "num_tokens": 734628.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5427970439195633, | |
| "epoch": 0.17164179104477612, | |
| "grad_norm": 0.13948297500610352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476619601249695, | |
| "mean_token_accuracy": 0.7795768678188324, | |
| "num_tokens": 750996.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5452166348695755, | |
| "epoch": 0.17537313432835822, | |
| "grad_norm": 0.17319753766059875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554689168930054, | |
| "mean_token_accuracy": 0.7776593416929245, | |
| "num_tokens": 767284.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5613571405410767, | |
| "epoch": 0.1791044776119403, | |
| "grad_norm": 0.15226703882217407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640038847923279, | |
| "mean_token_accuracy": 0.7746699303388596, | |
| "num_tokens": 783601.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5535127073526382, | |
| "epoch": 0.1828358208955224, | |
| "grad_norm": 0.166432186961174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462499856948853, | |
| "mean_token_accuracy": 0.7813286185264587, | |
| "num_tokens": 799773.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5604032725095749, | |
| "epoch": 0.1865671641791045, | |
| "grad_norm": 0.17004649341106415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530112981796265, | |
| "mean_token_accuracy": 0.7776568233966827, | |
| "num_tokens": 816032.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5409559532999992, | |
| "epoch": 0.19029850746268656, | |
| "grad_norm": 0.14887484908103943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343962907791138, | |
| "mean_token_accuracy": 0.7841377556324005, | |
| "num_tokens": 832227.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5414481312036514, | |
| "epoch": 0.19402985074626866, | |
| "grad_norm": 0.20319198071956635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386375188827515, | |
| "mean_token_accuracy": 0.7845792174339294, | |
| "num_tokens": 848643.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5497538298368454, | |
| "epoch": 0.19776119402985073, | |
| "grad_norm": 0.16608890891075134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512281656265259, | |
| "mean_token_accuracy": 0.7805987298488617, | |
| "num_tokens": 865199.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.545375257730484, | |
| "epoch": 0.20149253731343283, | |
| "grad_norm": 0.17525805532932281, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542587637901306, | |
| "mean_token_accuracy": 0.7773701697587967, | |
| "num_tokens": 881379.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5477564036846161, | |
| "epoch": 0.20522388059701493, | |
| "grad_norm": 0.19050806760787964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655733942985535, | |
| "mean_token_accuracy": 0.7745383828878403, | |
| "num_tokens": 897934.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5568059235811234, | |
| "epoch": 0.208955223880597, | |
| "grad_norm": 0.16148774325847626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592997074127197, | |
| "mean_token_accuracy": 0.7772074788808823, | |
| "num_tokens": 914308.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5678450167179108, | |
| "epoch": 0.2126865671641791, | |
| "grad_norm": 0.16515380144119263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.569266676902771, | |
| "mean_token_accuracy": 0.7714356333017349, | |
| "num_tokens": 930508.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.580150917172432, | |
| "epoch": 0.21641791044776118, | |
| "grad_norm": 0.17066031694412231, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5749757289886475, | |
| "mean_token_accuracy": 0.7655356675386429, | |
| "num_tokens": 946877.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5695585310459137, | |
| "epoch": 0.22014925373134328, | |
| "grad_norm": 0.16599293053150177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635928511619568, | |
| "mean_token_accuracy": 0.7739954739809036, | |
| "num_tokens": 963218.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5330293923616409, | |
| "epoch": 0.22388059701492538, | |
| "grad_norm": 0.14891624450683594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344960689544678, | |
| "mean_token_accuracy": 0.7841218858957291, | |
| "num_tokens": 979460.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5383697599172592, | |
| "epoch": 0.22761194029850745, | |
| "grad_norm": 0.16252915561199188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413715243339539, | |
| "mean_token_accuracy": 0.7826660871505737, | |
| "num_tokens": 995619.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5535406023263931, | |
| "epoch": 0.23134328358208955, | |
| "grad_norm": 0.15229789912700653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558712899684906, | |
| "mean_token_accuracy": 0.7769492119550705, | |
| "num_tokens": 1011885.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5603247284889221, | |
| "epoch": 0.23507462686567165, | |
| "grad_norm": 0.14967045187950134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645769834518433, | |
| "mean_token_accuracy": 0.771862581372261, | |
| "num_tokens": 1028352.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.563384547829628, | |
| "epoch": 0.23880597014925373, | |
| "grad_norm": 0.15884719789028168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637681484222412, | |
| "mean_token_accuracy": 0.7742781639099121, | |
| "num_tokens": 1044550.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5692009180784225, | |
| "epoch": 0.24253731343283583, | |
| "grad_norm": 0.16877400875091553, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609120726585388, | |
| "mean_token_accuracy": 0.7724380940198898, | |
| "num_tokens": 1060869.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5652668327093124, | |
| "epoch": 0.2462686567164179, | |
| "grad_norm": 0.14263105392456055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577319264411926, | |
| "mean_token_accuracy": 0.7767308205366135, | |
| "num_tokens": 1077318.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5624865591526031, | |
| "epoch": 0.25, | |
| "grad_norm": 0.1326468139886856, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610349774360657, | |
| "mean_token_accuracy": 0.7767885029315948, | |
| "num_tokens": 1093946.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5453900694847107, | |
| "epoch": 0.2537313432835821, | |
| "grad_norm": 0.15602754056453705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474068522453308, | |
| "mean_token_accuracy": 0.7804547101259232, | |
| "num_tokens": 1110166.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5495888441801071, | |
| "epoch": 0.2574626865671642, | |
| "grad_norm": 0.16421914100646973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586546063423157, | |
| "mean_token_accuracy": 0.7761986404657364, | |
| "num_tokens": 1126524.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5564677566289902, | |
| "epoch": 0.26119402985074625, | |
| "grad_norm": 0.17955079674720764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570371687412262, | |
| "mean_token_accuracy": 0.7711490094661713, | |
| "num_tokens": 1142935.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5473903864622116, | |
| "epoch": 0.26492537313432835, | |
| "grad_norm": 0.14180611073970795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549370527267456, | |
| "mean_token_accuracy": 0.7789817303419113, | |
| "num_tokens": 1159182.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5544993579387665, | |
| "epoch": 0.26865671641791045, | |
| "grad_norm": 0.1569361388683319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507487058639526, | |
| "mean_token_accuracy": 0.7766937166452408, | |
| "num_tokens": 1175525.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5662118345499039, | |
| "epoch": 0.27238805970149255, | |
| "grad_norm": 0.15652883052825928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632150173187256, | |
| "mean_token_accuracy": 0.7702545374631882, | |
| "num_tokens": 1191955.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5581929385662079, | |
| "epoch": 0.27611940298507465, | |
| "grad_norm": 0.1360681653022766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503684878349304, | |
| "mean_token_accuracy": 0.7764260619878769, | |
| "num_tokens": 1208034.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5687559396028519, | |
| "epoch": 0.2798507462686567, | |
| "grad_norm": 0.13728748261928558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678715109825134, | |
| "mean_token_accuracy": 0.7728003114461899, | |
| "num_tokens": 1224533.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5481379926204681, | |
| "epoch": 0.2835820895522388, | |
| "grad_norm": 0.16217739880084991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537081956863403, | |
| "mean_token_accuracy": 0.7751952260732651, | |
| "num_tokens": 1240962.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5639017820358276, | |
| "epoch": 0.2873134328358209, | |
| "grad_norm": 0.1611357033252716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5741861462593079, | |
| "mean_token_accuracy": 0.7681055814027786, | |
| "num_tokens": 1257195.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5481198877096176, | |
| "epoch": 0.291044776119403, | |
| "grad_norm": 0.12783770263195038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473082065582275, | |
| "mean_token_accuracy": 0.777423769235611, | |
| "num_tokens": 1273603.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.539246067404747, | |
| "epoch": 0.2947761194029851, | |
| "grad_norm": 0.1314576119184494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311948657035828, | |
| "mean_token_accuracy": 0.7861492037773132, | |
| "num_tokens": 1289837.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.554696649312973, | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 0.1476278305053711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538964867591858, | |
| "mean_token_accuracy": 0.7750344574451447, | |
| "num_tokens": 1306338.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5469587296247482, | |
| "epoch": 0.30223880597014924, | |
| "grad_norm": 0.16194719076156616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554472804069519, | |
| "mean_token_accuracy": 0.7799090445041656, | |
| "num_tokens": 1322825.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5433253645896912, | |
| "epoch": 0.30597014925373134, | |
| "grad_norm": 0.16987131536006927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523664355278015, | |
| "mean_token_accuracy": 0.776031419634819, | |
| "num_tokens": 1338865.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5386127680540085, | |
| "epoch": 0.30970149253731344, | |
| "grad_norm": 0.14176225662231445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489001870155334, | |
| "mean_token_accuracy": 0.7799653261899948, | |
| "num_tokens": 1355248.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5415250957012177, | |
| "epoch": 0.31343283582089554, | |
| "grad_norm": 0.17086099088191986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545318067073822, | |
| "mean_token_accuracy": 0.7825302183628082, | |
| "num_tokens": 1371746.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5727111548185349, | |
| "epoch": 0.31716417910447764, | |
| "grad_norm": 0.15196099877357483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5717822909355164, | |
| "mean_token_accuracy": 0.769862562417984, | |
| "num_tokens": 1388201.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5487467050552368, | |
| "epoch": 0.3208955223880597, | |
| "grad_norm": 0.12406057119369507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426313877105713, | |
| "mean_token_accuracy": 0.7817563712596893, | |
| "num_tokens": 1404461.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5417477786540985, | |
| "epoch": 0.3246268656716418, | |
| "grad_norm": 0.1868571937084198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441780090332031, | |
| "mean_token_accuracy": 0.7824695259332657, | |
| "num_tokens": 1420484.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.552739754319191, | |
| "epoch": 0.3283582089552239, | |
| "grad_norm": 0.12260660529136658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459186434745789, | |
| "mean_token_accuracy": 0.7800513356924057, | |
| "num_tokens": 1436981.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5539838075637817, | |
| "epoch": 0.332089552238806, | |
| "grad_norm": 0.19637417793273926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502506494522095, | |
| "mean_token_accuracy": 0.779677152633667, | |
| "num_tokens": 1453360.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5393257439136505, | |
| "epoch": 0.3358208955223881, | |
| "grad_norm": 0.14825744926929474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465800762176514, | |
| "mean_token_accuracy": 0.7785906046628952, | |
| "num_tokens": 1469575.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5319312065839767, | |
| "epoch": 0.33955223880597013, | |
| "grad_norm": 0.1817854791879654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348737835884094, | |
| "mean_token_accuracy": 0.7835152447223663, | |
| "num_tokens": 1485763.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5510641485452652, | |
| "epoch": 0.34328358208955223, | |
| "grad_norm": 0.1455191969871521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464341044425964, | |
| "mean_token_accuracy": 0.7820889949798584, | |
| "num_tokens": 1502105.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5406191498041153, | |
| "epoch": 0.34701492537313433, | |
| "grad_norm": 0.1273794025182724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421090722084045, | |
| "mean_token_accuracy": 0.7849924713373184, | |
| "num_tokens": 1518477.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5232429951429367, | |
| "epoch": 0.35074626865671643, | |
| "grad_norm": 0.14684391021728516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232130289077759, | |
| "mean_token_accuracy": 0.7893925607204437, | |
| "num_tokens": 1534743.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5493894517421722, | |
| "epoch": 0.35447761194029853, | |
| "grad_norm": 0.12976326048374176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556308627128601, | |
| "mean_token_accuracy": 0.7738792598247528, | |
| "num_tokens": 1551015.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5568605363368988, | |
| "epoch": 0.3582089552238806, | |
| "grad_norm": 0.15545816719532013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611149668693542, | |
| "mean_token_accuracy": 0.7729773372411728, | |
| "num_tokens": 1567597.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.554488942027092, | |
| "epoch": 0.3619402985074627, | |
| "grad_norm": 0.1307706981897354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501843094825745, | |
| "mean_token_accuracy": 0.7798233777284622, | |
| "num_tokens": 1583851.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5296479314565659, | |
| "epoch": 0.3656716417910448, | |
| "grad_norm": 0.1413222700357437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348843932151794, | |
| "mean_token_accuracy": 0.7847397029399872, | |
| "num_tokens": 1599880.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5739381164312363, | |
| "epoch": 0.3694029850746269, | |
| "grad_norm": 0.14992888271808624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5711988210678101, | |
| "mean_token_accuracy": 0.769414946436882, | |
| "num_tokens": 1616161.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5500659346580505, | |
| "epoch": 0.373134328358209, | |
| "grad_norm": 0.13987883925437927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535774230957031, | |
| "mean_token_accuracy": 0.7796037644147873, | |
| "num_tokens": 1632650.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5421769469976425, | |
| "epoch": 0.376865671641791, | |
| "grad_norm": 0.14819589257240295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429503917694092, | |
| "mean_token_accuracy": 0.7809022516012192, | |
| "num_tokens": 1649147.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5444748848676682, | |
| "epoch": 0.3805970149253731, | |
| "grad_norm": 0.15763095021247864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527257919311523, | |
| "mean_token_accuracy": 0.7789772897958755, | |
| "num_tokens": 1665434.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5364149361848831, | |
| "epoch": 0.3843283582089552, | |
| "grad_norm": 0.12937362492084503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445730090141296, | |
| "mean_token_accuracy": 0.7801977097988129, | |
| "num_tokens": 1681628.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.5520685017108917, | |
| "epoch": 0.3880597014925373, | |
| "grad_norm": 0.13224048912525177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565529465675354, | |
| "mean_token_accuracy": 0.7761769741773605, | |
| "num_tokens": 1698024.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5505486279726028, | |
| "epoch": 0.3917910447761194, | |
| "grad_norm": 0.12523634731769562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501624345779419, | |
| "mean_token_accuracy": 0.776427686214447, | |
| "num_tokens": 1714432.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5415863394737244, | |
| "epoch": 0.39552238805970147, | |
| "grad_norm": 0.12370901554822922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389205813407898, | |
| "mean_token_accuracy": 0.7835447043180466, | |
| "num_tokens": 1730701.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.535835400223732, | |
| "epoch": 0.39925373134328357, | |
| "grad_norm": 0.12875092029571533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339052081108093, | |
| "mean_token_accuracy": 0.7833075076341629, | |
| "num_tokens": 1747039.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5391292423009872, | |
| "epoch": 0.40298507462686567, | |
| "grad_norm": 0.13361512124538422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480363368988037, | |
| "mean_token_accuracy": 0.778292641043663, | |
| "num_tokens": 1763231.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5451123267412186, | |
| "epoch": 0.40671641791044777, | |
| "grad_norm": 0.12270035594701767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544527530670166, | |
| "mean_token_accuracy": 0.7805600017309189, | |
| "num_tokens": 1779643.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5353200137615204, | |
| "epoch": 0.41044776119402987, | |
| "grad_norm": 0.15249699354171753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540695309638977, | |
| "mean_token_accuracy": 0.7809852063655853, | |
| "num_tokens": 1795799.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5517745912075043, | |
| "epoch": 0.4141791044776119, | |
| "grad_norm": 0.13048961758613586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428080558776855, | |
| "mean_token_accuracy": 0.7799961864948273, | |
| "num_tokens": 1812372.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5553679913282394, | |
| "epoch": 0.417910447761194, | |
| "grad_norm": 0.135862797498703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515741109848022, | |
| "mean_token_accuracy": 0.7762576192617416, | |
| "num_tokens": 1828663.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5415378957986832, | |
| "epoch": 0.4216417910447761, | |
| "grad_norm": 0.17365720868110657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439163446426392, | |
| "mean_token_accuracy": 0.7816168814897537, | |
| "num_tokens": 1845046.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5443854928016663, | |
| "epoch": 0.4253731343283582, | |
| "grad_norm": 0.13225306570529938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523333549499512, | |
| "mean_token_accuracy": 0.7754887640476227, | |
| "num_tokens": 1861463.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.536818191409111, | |
| "epoch": 0.4291044776119403, | |
| "grad_norm": 0.18661700189113617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445066094398499, | |
| "mean_token_accuracy": 0.7783756703138351, | |
| "num_tokens": 1877488.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5401700437068939, | |
| "epoch": 0.43283582089552236, | |
| "grad_norm": 0.1313197761774063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441405773162842, | |
| "mean_token_accuracy": 0.779263436794281, | |
| "num_tokens": 1893953.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5655902773141861, | |
| "epoch": 0.43656716417910446, | |
| "grad_norm": 0.14134129881858826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561054944992065, | |
| "mean_token_accuracy": 0.7760706096887589, | |
| "num_tokens": 1910559.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5377545058727264, | |
| "epoch": 0.44029850746268656, | |
| "grad_norm": 0.1476624757051468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377650260925293, | |
| "mean_token_accuracy": 0.784254401922226, | |
| "num_tokens": 1926798.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5710994154214859, | |
| "epoch": 0.44402985074626866, | |
| "grad_norm": 0.12695498764514923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705847144126892, | |
| "mean_token_accuracy": 0.7709101736545563, | |
| "num_tokens": 1943309.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5473001599311829, | |
| "epoch": 0.44776119402985076, | |
| "grad_norm": 0.13190272450447083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527402758598328, | |
| "mean_token_accuracy": 0.7776251584291458, | |
| "num_tokens": 1959914.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5332797467708588, | |
| "epoch": 0.45149253731343286, | |
| "grad_norm": 0.1538720279932022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541407585144043, | |
| "mean_token_accuracy": 0.7805240601301193, | |
| "num_tokens": 1976350.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5485477149486542, | |
| "epoch": 0.4552238805970149, | |
| "grad_norm": 0.1464855819940567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562998056411743, | |
| "mean_token_accuracy": 0.7745071202516556, | |
| "num_tokens": 1992575.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5465153902769089, | |
| "epoch": 0.458955223880597, | |
| "grad_norm": 0.1392602175474167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450125932693481, | |
| "mean_token_accuracy": 0.7803204655647278, | |
| "num_tokens": 2008818.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5216257721185684, | |
| "epoch": 0.4626865671641791, | |
| "grad_norm": 0.16500917077064514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5204989314079285, | |
| "mean_token_accuracy": 0.7916441410779953, | |
| "num_tokens": 2024909.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5582488030195236, | |
| "epoch": 0.4664179104477612, | |
| "grad_norm": 0.12797319889068604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522317290306091, | |
| "mean_token_accuracy": 0.7782706022262573, | |
| "num_tokens": 2041274.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5451529324054718, | |
| "epoch": 0.4701492537313433, | |
| "grad_norm": 0.136440709233284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448014736175537, | |
| "mean_token_accuracy": 0.7787207514047623, | |
| "num_tokens": 2057665.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5657823532819748, | |
| "epoch": 0.47388059701492535, | |
| "grad_norm": 0.13369601964950562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634066462516785, | |
| "mean_token_accuracy": 0.7729785293340683, | |
| "num_tokens": 2074159.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.52435402572155, | |
| "epoch": 0.47761194029850745, | |
| "grad_norm": 0.13124150037765503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261214971542358, | |
| "mean_token_accuracy": 0.787582278251648, | |
| "num_tokens": 2090388.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5388573259115219, | |
| "epoch": 0.48134328358208955, | |
| "grad_norm": 0.1402949094772339, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444526672363281, | |
| "mean_token_accuracy": 0.780138373374939, | |
| "num_tokens": 2106895.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5594224631786346, | |
| "epoch": 0.48507462686567165, | |
| "grad_norm": 0.12214766442775726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5680845379829407, | |
| "mean_token_accuracy": 0.7693810015916824, | |
| "num_tokens": 2122936.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5598264634609222, | |
| "epoch": 0.48880597014925375, | |
| "grad_norm": 0.11836589127779007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608173608779907, | |
| "mean_token_accuracy": 0.7735486477613449, | |
| "num_tokens": 2139356.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5484192073345184, | |
| "epoch": 0.4925373134328358, | |
| "grad_norm": 0.11776985228061676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445444583892822, | |
| "mean_token_accuracy": 0.7797606885433197, | |
| "num_tokens": 2155868.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5602923631668091, | |
| "epoch": 0.4962686567164179, | |
| "grad_norm": 0.12020131945610046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522936582565308, | |
| "mean_token_accuracy": 0.7776170521974564, | |
| "num_tokens": 2172336.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5583924055099487, | |
| "epoch": 0.5, | |
| "grad_norm": 0.1295275241136551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5662660002708435, | |
| "mean_token_accuracy": 0.7716575860977173, | |
| "num_tokens": 2188518.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5514810979366302, | |
| "epoch": 0.503731343283582, | |
| "grad_norm": 0.1089273989200592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514034032821655, | |
| "mean_token_accuracy": 0.7769223898649216, | |
| "num_tokens": 2205142.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5440865606069565, | |
| "epoch": 0.5074626865671642, | |
| "grad_norm": 0.13056722283363342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475744009017944, | |
| "mean_token_accuracy": 0.7764044553041458, | |
| "num_tokens": 2221743.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5476541817188263, | |
| "epoch": 0.5111940298507462, | |
| "grad_norm": 0.13166996836662292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477900505065918, | |
| "mean_token_accuracy": 0.7784378528594971, | |
| "num_tokens": 2238142.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5558486729860306, | |
| "epoch": 0.5149253731343284, | |
| "grad_norm": 0.12133946269750595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609108209609985, | |
| "mean_token_accuracy": 0.7736046612262726, | |
| "num_tokens": 2254456.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5566332340240479, | |
| "epoch": 0.5186567164179104, | |
| "grad_norm": 0.12148908525705338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561110973358154, | |
| "mean_token_accuracy": 0.7756631374359131, | |
| "num_tokens": 2270696.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5462600067257881, | |
| "epoch": 0.5223880597014925, | |
| "grad_norm": 0.1129021942615509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448604822158813, | |
| "mean_token_accuracy": 0.7795793265104294, | |
| "num_tokens": 2287025.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5399314314126968, | |
| "epoch": 0.5261194029850746, | |
| "grad_norm": 0.1251847892999649, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481414794921875, | |
| "mean_token_accuracy": 0.778893768787384, | |
| "num_tokens": 2303399.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5469618439674377, | |
| "epoch": 0.5298507462686567, | |
| "grad_norm": 0.11956755071878433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474820137023926, | |
| "mean_token_accuracy": 0.7784739285707474, | |
| "num_tokens": 2319818.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5447351336479187, | |
| "epoch": 0.5335820895522388, | |
| "grad_norm": 0.14881564676761627, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410581827163696, | |
| "mean_token_accuracy": 0.781320258975029, | |
| "num_tokens": 2335949.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5449966341257095, | |
| "epoch": 0.5373134328358209, | |
| "grad_norm": 0.12103825062513351, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471005439758301, | |
| "mean_token_accuracy": 0.7796377539634705, | |
| "num_tokens": 2352269.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5632765144109726, | |
| "epoch": 0.5410447761194029, | |
| "grad_norm": 0.12277977168560028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5630727410316467, | |
| "mean_token_accuracy": 0.7703763097524643, | |
| "num_tokens": 2368674.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5339089632034302, | |
| "epoch": 0.5447761194029851, | |
| "grad_norm": 0.14498627185821533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364416241645813, | |
| "mean_token_accuracy": 0.7819968014955521, | |
| "num_tokens": 2384936.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5429459661245346, | |
| "epoch": 0.5485074626865671, | |
| "grad_norm": 0.12051384150981903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456188917160034, | |
| "mean_token_accuracy": 0.7803860902786255, | |
| "num_tokens": 2401292.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5626052618026733, | |
| "epoch": 0.5522388059701493, | |
| "grad_norm": 0.1412496566772461, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596410036087036, | |
| "mean_token_accuracy": 0.7737385481595993, | |
| "num_tokens": 2417925.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5565475225448608, | |
| "epoch": 0.5559701492537313, | |
| "grad_norm": 0.1441730409860611, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520785450935364, | |
| "mean_token_accuracy": 0.775386318564415, | |
| "num_tokens": 2434621.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5197634100914001, | |
| "epoch": 0.5597014925373134, | |
| "grad_norm": 0.12098351866006851, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222008228302002, | |
| "mean_token_accuracy": 0.7903124392032623, | |
| "num_tokens": 2450903.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5489796698093414, | |
| "epoch": 0.5634328358208955, | |
| "grad_norm": 0.14946326613426208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559377074241638, | |
| "mean_token_accuracy": 0.775105893611908, | |
| "num_tokens": 2467105.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5400301665067673, | |
| "epoch": 0.5671641791044776, | |
| "grad_norm": 0.12906025350093842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420807600021362, | |
| "mean_token_accuracy": 0.7795381844043732, | |
| "num_tokens": 2483456.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5474328249692917, | |
| "epoch": 0.5708955223880597, | |
| "grad_norm": 0.12021685391664505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509780049324036, | |
| "mean_token_accuracy": 0.7768895477056503, | |
| "num_tokens": 2500011.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5380930155515671, | |
| "epoch": 0.5746268656716418, | |
| "grad_norm": 0.11843080073595047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308334827423096, | |
| "mean_token_accuracy": 0.7881843447685242, | |
| "num_tokens": 2516780.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5460693091154099, | |
| "epoch": 0.5783582089552238, | |
| "grad_norm": 0.16729064285755157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530881285667419, | |
| "mean_token_accuracy": 0.7742334753274918, | |
| "num_tokens": 2532837.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5500553995370865, | |
| "epoch": 0.582089552238806, | |
| "grad_norm": 0.1366872787475586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533830523490906, | |
| "mean_token_accuracy": 0.7750078588724136, | |
| "num_tokens": 2549157.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5497538447380066, | |
| "epoch": 0.585820895522388, | |
| "grad_norm": 0.12214312702417374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549652576446533, | |
| "mean_token_accuracy": 0.7742869108915329, | |
| "num_tokens": 2565745.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5520212799310684, | |
| "epoch": 0.5895522388059702, | |
| "grad_norm": 0.13198687136173248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503985285758972, | |
| "mean_token_accuracy": 0.7776314318180084, | |
| "num_tokens": 2582172.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5420894026756287, | |
| "epoch": 0.5932835820895522, | |
| "grad_norm": 0.1303817480802536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545700192451477, | |
| "mean_token_accuracy": 0.7790375500917435, | |
| "num_tokens": 2598785.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5361281335353851, | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 0.13537634909152985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409078598022461, | |
| "mean_token_accuracy": 0.779214471578598, | |
| "num_tokens": 2615324.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5633385479450226, | |
| "epoch": 0.6007462686567164, | |
| "grad_norm": 0.11204258352518082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624291896820068, | |
| "mean_token_accuracy": 0.7730776518583298, | |
| "num_tokens": 2631612.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5239899605512619, | |
| "epoch": 0.6044776119402985, | |
| "grad_norm": 0.14660899341106415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244404673576355, | |
| "mean_token_accuracy": 0.7870436310768127, | |
| "num_tokens": 2648098.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5414755046367645, | |
| "epoch": 0.6082089552238806, | |
| "grad_norm": 0.11887400597333908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397330522537231, | |
| "mean_token_accuracy": 0.7847625911235809, | |
| "num_tokens": 2664285.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5442674309015274, | |
| "epoch": 0.6119402985074627, | |
| "grad_norm": 0.11572780460119247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454840660095215, | |
| "mean_token_accuracy": 0.7809286564588547, | |
| "num_tokens": 2680551.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5371343344449997, | |
| "epoch": 0.6156716417910447, | |
| "grad_norm": 0.1056356355547905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348964929580688, | |
| "mean_token_accuracy": 0.7857467532157898, | |
| "num_tokens": 2697071.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5399870425462723, | |
| "epoch": 0.6194029850746269, | |
| "grad_norm": 0.13278594613075256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447728633880615, | |
| "mean_token_accuracy": 0.7792245298624039, | |
| "num_tokens": 2713461.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5350475907325745, | |
| "epoch": 0.6231343283582089, | |
| "grad_norm": 0.1305065155029297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362796783447266, | |
| "mean_token_accuracy": 0.7812380343675613, | |
| "num_tokens": 2729505.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5582499951124191, | |
| "epoch": 0.6268656716417911, | |
| "grad_norm": 0.12587526440620422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559293627738953, | |
| "mean_token_accuracy": 0.7746618837118149, | |
| "num_tokens": 2746287.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5586439073085785, | |
| "epoch": 0.6305970149253731, | |
| "grad_norm": 0.12845800817012787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518544912338257, | |
| "mean_token_accuracy": 0.7751341164112091, | |
| "num_tokens": 2762818.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5343242138624191, | |
| "epoch": 0.6343283582089553, | |
| "grad_norm": 0.15256647765636444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386060476303101, | |
| "mean_token_accuracy": 0.7807702422142029, | |
| "num_tokens": 2779199.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5373098105192184, | |
| "epoch": 0.6380597014925373, | |
| "grad_norm": 0.13263238966464996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466636419296265, | |
| "mean_token_accuracy": 0.7765426337718964, | |
| "num_tokens": 2795330.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5298089534044266, | |
| "epoch": 0.6417910447761194, | |
| "grad_norm": 0.12450744211673737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325064659118652, | |
| "mean_token_accuracy": 0.7838508486747742, | |
| "num_tokens": 2811566.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5550331622362137, | |
| "epoch": 0.6455223880597015, | |
| "grad_norm": 0.111052505671978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552961230278015, | |
| "mean_token_accuracy": 0.7752347737550735, | |
| "num_tokens": 2827783.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5506296455860138, | |
| "epoch": 0.6492537313432836, | |
| "grad_norm": 0.13255524635314941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490573048591614, | |
| "mean_token_accuracy": 0.7767810970544815, | |
| "num_tokens": 2844210.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5634674280881882, | |
| "epoch": 0.6529850746268657, | |
| "grad_norm": 0.11786694079637527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5620654225349426, | |
| "mean_token_accuracy": 0.7710569798946381, | |
| "num_tokens": 2860606.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5491903871297836, | |
| "epoch": 0.6567164179104478, | |
| "grad_norm": 0.1378813087940216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544133186340332, | |
| "mean_token_accuracy": 0.7742699533700943, | |
| "num_tokens": 2876978.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5418348163366318, | |
| "epoch": 0.6604477611940298, | |
| "grad_norm": 0.1719319373369217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509114265441895, | |
| "mean_token_accuracy": 0.7738531082868576, | |
| "num_tokens": 2893436.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5621145367622375, | |
| "epoch": 0.664179104477612, | |
| "grad_norm": 0.13473528623580933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569881796836853, | |
| "mean_token_accuracy": 0.7752742022275925, | |
| "num_tokens": 2909714.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5567401647567749, | |
| "epoch": 0.667910447761194, | |
| "grad_norm": 0.15127326548099518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531461238861084, | |
| "mean_token_accuracy": 0.7789575010538101, | |
| "num_tokens": 2926148.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5276759713888168, | |
| "epoch": 0.6716417910447762, | |
| "grad_norm": 0.1254606693983078, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301634669303894, | |
| "mean_token_accuracy": 0.7837289869785309, | |
| "num_tokens": 2942739.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5278603881597519, | |
| "epoch": 0.6753731343283582, | |
| "grad_norm": 0.128974050283432, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369632244110107, | |
| "mean_token_accuracy": 0.7825482338666916, | |
| "num_tokens": 2958977.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5377722084522247, | |
| "epoch": 0.6791044776119403, | |
| "grad_norm": 0.13316886126995087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483193397521973, | |
| "mean_token_accuracy": 0.7763564735651016, | |
| "num_tokens": 2975274.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5507437884807587, | |
| "epoch": 0.6828358208955224, | |
| "grad_norm": 0.12445816397666931, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532326698303223, | |
| "mean_token_accuracy": 0.7756502628326416, | |
| "num_tokens": 2991599.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5495483875274658, | |
| "epoch": 0.6865671641791045, | |
| "grad_norm": 0.11616785079240799, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388738512992859, | |
| "mean_token_accuracy": 0.780926913022995, | |
| "num_tokens": 3008127.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5604113638401031, | |
| "epoch": 0.6902985074626866, | |
| "grad_norm": 0.10933785885572433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567720532417297, | |
| "mean_token_accuracy": 0.7762922942638397, | |
| "num_tokens": 3024360.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5393257141113281, | |
| "epoch": 0.6940298507462687, | |
| "grad_norm": 0.13075008988380432, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377945303916931, | |
| "mean_token_accuracy": 0.7826398611068726, | |
| "num_tokens": 3040880.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.534931406378746, | |
| "epoch": 0.6977611940298507, | |
| "grad_norm": 0.11783911287784576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384173393249512, | |
| "mean_token_accuracy": 0.7814484983682632, | |
| "num_tokens": 3057215.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5480581521987915, | |
| "epoch": 0.7014925373134329, | |
| "grad_norm": 0.11767826229333878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535053610801697, | |
| "mean_token_accuracy": 0.7753477245569229, | |
| "num_tokens": 3073526.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5417313128709793, | |
| "epoch": 0.7052238805970149, | |
| "grad_norm": 0.1221914142370224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454643368721008, | |
| "mean_token_accuracy": 0.7766887843608856, | |
| "num_tokens": 3089677.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5625078678131104, | |
| "epoch": 0.7089552238805971, | |
| "grad_norm": 0.11974587291479111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611926913261414, | |
| "mean_token_accuracy": 0.7717815935611725, | |
| "num_tokens": 3105979.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5516901463270187, | |
| "epoch": 0.7126865671641791, | |
| "grad_norm": 0.11311069130897522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487813949584961, | |
| "mean_token_accuracy": 0.7764030396938324, | |
| "num_tokens": 3122320.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5541231781244278, | |
| "epoch": 0.7164179104477612, | |
| "grad_norm": 0.12345684319734573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585082173347473, | |
| "mean_token_accuracy": 0.774434968829155, | |
| "num_tokens": 3138647.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5558422803878784, | |
| "epoch": 0.7201492537313433, | |
| "grad_norm": 0.13054387271404266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540096163749695, | |
| "mean_token_accuracy": 0.7756641954183578, | |
| "num_tokens": 3154847.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.55143603682518, | |
| "epoch": 0.7238805970149254, | |
| "grad_norm": 0.14231973886489868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643096566200256, | |
| "mean_token_accuracy": 0.7717767059803009, | |
| "num_tokens": 3171336.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5277590304613113, | |
| "epoch": 0.7276119402985075, | |
| "grad_norm": 0.12328840047121048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327441096305847, | |
| "mean_token_accuracy": 0.7853522598743439, | |
| "num_tokens": 3187829.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5539046078920364, | |
| "epoch": 0.7313432835820896, | |
| "grad_norm": 0.12686993181705475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454736948013306, | |
| "mean_token_accuracy": 0.7813247591257095, | |
| "num_tokens": 3204100.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5553427636623383, | |
| "epoch": 0.7350746268656716, | |
| "grad_norm": 0.14084763824939728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538918972015381, | |
| "mean_token_accuracy": 0.7761572599411011, | |
| "num_tokens": 3220526.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5553955286741257, | |
| "epoch": 0.7388059701492538, | |
| "grad_norm": 0.15137532353401184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569421648979187, | |
| "mean_token_accuracy": 0.7751066386699677, | |
| "num_tokens": 3237005.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5306164473295212, | |
| "epoch": 0.7425373134328358, | |
| "grad_norm": 0.14029283821582794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325392484664917, | |
| "mean_token_accuracy": 0.7821047902107239, | |
| "num_tokens": 3253191.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5289445072412491, | |
| "epoch": 0.746268656716418, | |
| "grad_norm": 0.1625203937292099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530889093875885, | |
| "mean_token_accuracy": 0.7839524000883102, | |
| "num_tokens": 3269303.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5537738502025604, | |
| "epoch": 0.75, | |
| "grad_norm": 0.12837141752243042, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496644377708435, | |
| "mean_token_accuracy": 0.7775348573923111, | |
| "num_tokens": 3285861.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5437710881233215, | |
| "epoch": 0.753731343283582, | |
| "grad_norm": 0.15969154238700867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445458889007568, | |
| "mean_token_accuracy": 0.7779001444578171, | |
| "num_tokens": 3302531.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5435174703598022, | |
| "epoch": 0.7574626865671642, | |
| "grad_norm": 0.1447206735610962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419492125511169, | |
| "mean_token_accuracy": 0.782675564289093, | |
| "num_tokens": 3318918.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5337730944156647, | |
| "epoch": 0.7611940298507462, | |
| "grad_norm": 0.13017146289348602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400105714797974, | |
| "mean_token_accuracy": 0.7810544222593307, | |
| "num_tokens": 3335348.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.534254178404808, | |
| "epoch": 0.7649253731343284, | |
| "grad_norm": 0.11939690262079239, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358497500419617, | |
| "mean_token_accuracy": 0.7831085026264191, | |
| "num_tokens": 3351607.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5295046716928482, | |
| "epoch": 0.7686567164179104, | |
| "grad_norm": 0.17022010684013367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389232635498047, | |
| "mean_token_accuracy": 0.7811893969774246, | |
| "num_tokens": 3368046.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5532102882862091, | |
| "epoch": 0.7723880597014925, | |
| "grad_norm": 0.13207128643989563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556742787361145, | |
| "mean_token_accuracy": 0.7772794514894485, | |
| "num_tokens": 3384496.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5532752573490143, | |
| "epoch": 0.7761194029850746, | |
| "grad_norm": 0.16495952010154724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545544624328613, | |
| "mean_token_accuracy": 0.777538612484932, | |
| "num_tokens": 3400918.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.534032866358757, | |
| "epoch": 0.7798507462686567, | |
| "grad_norm": 0.1333177387714386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533141553401947, | |
| "mean_token_accuracy": 0.7848780155181885, | |
| "num_tokens": 3417300.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5450873523950577, | |
| "epoch": 0.7835820895522388, | |
| "grad_norm": 0.12406419962644577, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425257086753845, | |
| "mean_token_accuracy": 0.781457707285881, | |
| "num_tokens": 3433516.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5520957857370377, | |
| "epoch": 0.7873134328358209, | |
| "grad_norm": 0.16319960355758667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528780817985535, | |
| "mean_token_accuracy": 0.7751211673021317, | |
| "num_tokens": 3449854.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5411545261740685, | |
| "epoch": 0.7910447761194029, | |
| "grad_norm": 0.11995123326778412, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378537178039551, | |
| "mean_token_accuracy": 0.7797028720378876, | |
| "num_tokens": 3466138.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5522632747888565, | |
| "epoch": 0.7947761194029851, | |
| "grad_norm": 0.14674413204193115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561342239379883, | |
| "mean_token_accuracy": 0.7742671966552734, | |
| "num_tokens": 3482443.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5423247516155243, | |
| "epoch": 0.7985074626865671, | |
| "grad_norm": 0.1413860321044922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450446605682373, | |
| "mean_token_accuracy": 0.7770555764436722, | |
| "num_tokens": 3498627.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5330623686313629, | |
| "epoch": 0.8022388059701493, | |
| "grad_norm": 0.1323142796754837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411436557769775, | |
| "mean_token_accuracy": 0.7801088243722916, | |
| "num_tokens": 3515028.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5561616569757462, | |
| "epoch": 0.8059701492537313, | |
| "grad_norm": 0.14549626410007477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557980537414551, | |
| "mean_token_accuracy": 0.774229571223259, | |
| "num_tokens": 3531502.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5611517131328583, | |
| "epoch": 0.8097014925373134, | |
| "grad_norm": 0.13433797657489777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634274482727051, | |
| "mean_token_accuracy": 0.7715686410665512, | |
| "num_tokens": 3547519.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5514582842588425, | |
| "epoch": 0.8134328358208955, | |
| "grad_norm": 0.11890087276697159, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433245897293091, | |
| "mean_token_accuracy": 0.7793933302164078, | |
| "num_tokens": 3563773.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.534797728061676, | |
| "epoch": 0.8171641791044776, | |
| "grad_norm": 0.1360422521829605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381568670272827, | |
| "mean_token_accuracy": 0.7809459120035172, | |
| "num_tokens": 3580120.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5429193377494812, | |
| "epoch": 0.8208955223880597, | |
| "grad_norm": 0.13077932596206665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535344481468201, | |
| "mean_token_accuracy": 0.7765921354293823, | |
| "num_tokens": 3596382.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5237333700060844, | |
| "epoch": 0.8246268656716418, | |
| "grad_norm": 0.1276118904352188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291868448257446, | |
| "mean_token_accuracy": 0.7849691659212112, | |
| "num_tokens": 3612537.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5639058351516724, | |
| "epoch": 0.8283582089552238, | |
| "grad_norm": 0.1108359843492508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600181221961975, | |
| "mean_token_accuracy": 0.7725061029195786, | |
| "num_tokens": 3629049.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5387094169855118, | |
| "epoch": 0.832089552238806, | |
| "grad_norm": 0.14372611045837402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452870726585388, | |
| "mean_token_accuracy": 0.7791440933942795, | |
| "num_tokens": 3645497.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5521352589130402, | |
| "epoch": 0.835820895522388, | |
| "grad_norm": 0.1448589414358139, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500624775886536, | |
| "mean_token_accuracy": 0.7766592055559158, | |
| "num_tokens": 3661916.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5495995134115219, | |
| "epoch": 0.8395522388059702, | |
| "grad_norm": 0.11583460122346878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486539006233215, | |
| "mean_token_accuracy": 0.77958944439888, | |
| "num_tokens": 3678385.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5483616590499878, | |
| "epoch": 0.8432835820895522, | |
| "grad_norm": 0.12950138747692108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550271213054657, | |
| "mean_token_accuracy": 0.7755987495183945, | |
| "num_tokens": 3694915.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5614336878061295, | |
| "epoch": 0.8470149253731343, | |
| "grad_norm": 0.1335671842098236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636512041091919, | |
| "mean_token_accuracy": 0.7719693928956985, | |
| "num_tokens": 3710911.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5516408532857895, | |
| "epoch": 0.8507462686567164, | |
| "grad_norm": 0.11091525852680206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478500127792358, | |
| "mean_token_accuracy": 0.7780372649431229, | |
| "num_tokens": 3727387.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5346055030822754, | |
| "epoch": 0.8544776119402985, | |
| "grad_norm": 0.1468094438314438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368551015853882, | |
| "mean_token_accuracy": 0.7816846072673798, | |
| "num_tokens": 3743610.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5556191802024841, | |
| "epoch": 0.8582089552238806, | |
| "grad_norm": 0.12531019747257233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554017961025238, | |
| "mean_token_accuracy": 0.775733008980751, | |
| "num_tokens": 3759900.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5382195562124252, | |
| "epoch": 0.8619402985074627, | |
| "grad_norm": 0.12708726525306702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370462536811829, | |
| "mean_token_accuracy": 0.7824227660894394, | |
| "num_tokens": 3776209.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5437551140785217, | |
| "epoch": 0.8656716417910447, | |
| "grad_norm": 0.14250780642032623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482578277587891, | |
| "mean_token_accuracy": 0.7775947004556656, | |
| "num_tokens": 3792690.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5299069508910179, | |
| "epoch": 0.8694029850746269, | |
| "grad_norm": 0.09997344017028809, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321590900421143, | |
| "mean_token_accuracy": 0.7849525660276413, | |
| "num_tokens": 3808996.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5415566265583038, | |
| "epoch": 0.8731343283582089, | |
| "grad_norm": 0.14475880563259125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407425165176392, | |
| "mean_token_accuracy": 0.7812676578760147, | |
| "num_tokens": 3825184.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5459320992231369, | |
| "epoch": 0.8768656716417911, | |
| "grad_norm": 0.1116221696138382, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546471118927002, | |
| "mean_token_accuracy": 0.779377743601799, | |
| "num_tokens": 3841452.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5291514843702316, | |
| "epoch": 0.8805970149253731, | |
| "grad_norm": 0.12996730208396912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327478647232056, | |
| "mean_token_accuracy": 0.7848521023988724, | |
| "num_tokens": 3858017.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5208889245986938, | |
| "epoch": 0.8843283582089553, | |
| "grad_norm": 0.16807906329631805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301882028579712, | |
| "mean_token_accuracy": 0.786228597164154, | |
| "num_tokens": 3874064.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5617295503616333, | |
| "epoch": 0.8880597014925373, | |
| "grad_norm": 0.10751146823167801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591222047805786, | |
| "mean_token_accuracy": 0.7737416923046112, | |
| "num_tokens": 3890590.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5473610609769821, | |
| "epoch": 0.8917910447761194, | |
| "grad_norm": 0.156968355178833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408577919006348, | |
| "mean_token_accuracy": 0.7787807583808899, | |
| "num_tokens": 3906796.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5521116256713867, | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.1288469135761261, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549975574016571, | |
| "mean_token_accuracy": 0.7787336856126785, | |
| "num_tokens": 3923243.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5367736220359802, | |
| "epoch": 0.8992537313432836, | |
| "grad_norm": 0.15267081558704376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406203269958496, | |
| "mean_token_accuracy": 0.7823334783315659, | |
| "num_tokens": 3939802.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5384350121021271, | |
| "epoch": 0.9029850746268657, | |
| "grad_norm": 0.12661150097846985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470013618469238, | |
| "mean_token_accuracy": 0.7777878791093826, | |
| "num_tokens": 3956169.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.534332662820816, | |
| "epoch": 0.9067164179104478, | |
| "grad_norm": 0.1578921526670456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447706580162048, | |
| "mean_token_accuracy": 0.7791011482477188, | |
| "num_tokens": 3972588.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5489266514778137, | |
| "epoch": 0.9104477611940298, | |
| "grad_norm": 0.12818928062915802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481740236282349, | |
| "mean_token_accuracy": 0.7786219567060471, | |
| "num_tokens": 3988829.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5603043735027313, | |
| "epoch": 0.914179104477612, | |
| "grad_norm": 0.12620778381824493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473756194114685, | |
| "mean_token_accuracy": 0.7766416519880295, | |
| "num_tokens": 4005147.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5429242998361588, | |
| "epoch": 0.917910447761194, | |
| "grad_norm": 0.12476211786270142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349637269973755, | |
| "mean_token_accuracy": 0.7825885117053986, | |
| "num_tokens": 4021414.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5483033657073975, | |
| "epoch": 0.9216417910447762, | |
| "grad_norm": 0.12620662152767181, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528666973114014, | |
| "mean_token_accuracy": 0.7761824727058411, | |
| "num_tokens": 4038127.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5366939753293991, | |
| "epoch": 0.9253731343283582, | |
| "grad_norm": 0.14575915038585663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463760495185852, | |
| "mean_token_accuracy": 0.7789819538593292, | |
| "num_tokens": 4054823.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5289286822080612, | |
| "epoch": 0.9291044776119403, | |
| "grad_norm": 0.13227254152297974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342484712600708, | |
| "mean_token_accuracy": 0.7823342829942703, | |
| "num_tokens": 4071168.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5574782639741898, | |
| "epoch": 0.9328358208955224, | |
| "grad_norm": 0.11694958060979843, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555205225944519, | |
| "mean_token_accuracy": 0.7752824872732162, | |
| "num_tokens": 4087486.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5487115234136581, | |
| "epoch": 0.9365671641791045, | |
| "grad_norm": 0.12190678715705872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393535494804382, | |
| "mean_token_accuracy": 0.7831632941961288, | |
| "num_tokens": 4103816.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.559577152132988, | |
| "epoch": 0.9402985074626866, | |
| "grad_norm": 0.17028383910655975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525080561637878, | |
| "mean_token_accuracy": 0.7758573293685913, | |
| "num_tokens": 4120222.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5650424063205719, | |
| "epoch": 0.9440298507462687, | |
| "grad_norm": 0.11132688075304031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637966394424438, | |
| "mean_token_accuracy": 0.7707894593477249, | |
| "num_tokens": 4136652.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5160737410187721, | |
| "epoch": 0.9477611940298507, | |
| "grad_norm": 0.15931887924671173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282326936721802, | |
| "mean_token_accuracy": 0.7854665815830231, | |
| "num_tokens": 4152947.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.537076398730278, | |
| "epoch": 0.9514925373134329, | |
| "grad_norm": 0.12814630568027496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451772809028625, | |
| "mean_token_accuracy": 0.7802058607339859, | |
| "num_tokens": 4169503.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5342639088630676, | |
| "epoch": 0.9552238805970149, | |
| "grad_norm": 0.1517118364572525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411078333854675, | |
| "mean_token_accuracy": 0.7786644250154495, | |
| "num_tokens": 4185621.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5415196269750595, | |
| "epoch": 0.9589552238805971, | |
| "grad_norm": 0.1379823535680771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376235842704773, | |
| "mean_token_accuracy": 0.782574325799942, | |
| "num_tokens": 4201870.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5464203655719757, | |
| "epoch": 0.9626865671641791, | |
| "grad_norm": 0.11068425327539444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408488512039185, | |
| "mean_token_accuracy": 0.780770868062973, | |
| "num_tokens": 4218151.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5458406358957291, | |
| "epoch": 0.9664179104477612, | |
| "grad_norm": 0.12213952839374542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443609952926636, | |
| "mean_token_accuracy": 0.7778299003839493, | |
| "num_tokens": 4234366.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5463070273399353, | |
| "epoch": 0.9701492537313433, | |
| "grad_norm": 0.13273894786834717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463058948516846, | |
| "mean_token_accuracy": 0.7797796875238419, | |
| "num_tokens": 4250736.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5530222281813622, | |
| "epoch": 0.9738805970149254, | |
| "grad_norm": 0.1269286721944809, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598427057266235, | |
| "mean_token_accuracy": 0.7720119059085846, | |
| "num_tokens": 4267145.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5307595282793045, | |
| "epoch": 0.9776119402985075, | |
| "grad_norm": 0.15041397511959076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379044413566589, | |
| "mean_token_accuracy": 0.7826298028230667, | |
| "num_tokens": 4283482.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5570843815803528, | |
| "epoch": 0.9813432835820896, | |
| "grad_norm": 0.11555695533752441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584969520568848, | |
| "mean_token_accuracy": 0.7722631692886353, | |
| "num_tokens": 4300006.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5427989065647125, | |
| "epoch": 0.9850746268656716, | |
| "grad_norm": 0.11381992697715759, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401906967163086, | |
| "mean_token_accuracy": 0.7819131314754486, | |
| "num_tokens": 4316285.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5244657546281815, | |
| "epoch": 0.9888059701492538, | |
| "grad_norm": 0.12954184412956238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230352282524109, | |
| "mean_token_accuracy": 0.7875886708498001, | |
| "num_tokens": 4332644.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5411987751722336, | |
| "epoch": 0.9925373134328358, | |
| "grad_norm": 0.12008430063724518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408762097358704, | |
| "mean_token_accuracy": 0.7805971801280975, | |
| "num_tokens": 4349014.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.537212684750557, | |
| "epoch": 0.996268656716418, | |
| "grad_norm": 0.13956718146800995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449704527854919, | |
| "mean_token_accuracy": 0.7769150733947754, | |
| "num_tokens": 4365397.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5412362664937973, | |
| "epoch": 1.0, | |
| "grad_norm": 0.11382853984832764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392265319824219, | |
| "mean_token_accuracy": 0.7833839505910873, | |
| "num_tokens": 4381834.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5496137291193008, | |
| "epoch": 1.0037313432835822, | |
| "grad_norm": 0.14231012761592865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489864945411682, | |
| "mean_token_accuracy": 0.7766753733158112, | |
| "num_tokens": 4398074.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5562388151884079, | |
| "epoch": 1.007462686567164, | |
| "grad_norm": 0.14497025310993195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603899359703064, | |
| "mean_token_accuracy": 0.7735977172851562, | |
| "num_tokens": 4414424.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5293630063533783, | |
| "epoch": 1.0111940298507462, | |
| "grad_norm": 0.12251973897218704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230416059494019, | |
| "mean_token_accuracy": 0.7859042882919312, | |
| "num_tokens": 4430738.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5297266095876694, | |
| "epoch": 1.0149253731343284, | |
| "grad_norm": 0.12865795195102692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318350195884705, | |
| "mean_token_accuracy": 0.7831861972808838, | |
| "num_tokens": 4446854.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5223220437765121, | |
| "epoch": 1.0186567164179103, | |
| "grad_norm": 0.1494293063879013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327814221382141, | |
| "mean_token_accuracy": 0.7832103371620178, | |
| "num_tokens": 4463067.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5113897025585175, | |
| "epoch": 1.0223880597014925, | |
| "grad_norm": 0.11985855549573898, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5085136890411377, | |
| "mean_token_accuracy": 0.7943005859851837, | |
| "num_tokens": 4479208.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5331714898347855, | |
| "epoch": 1.0261194029850746, | |
| "grad_norm": 0.11615335196256638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315767526626587, | |
| "mean_token_accuracy": 0.7823154479265213, | |
| "num_tokens": 4495400.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5418258756399155, | |
| "epoch": 1.0298507462686568, | |
| "grad_norm": 0.12503200769424438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371681451797485, | |
| "mean_token_accuracy": 0.7810330092906952, | |
| "num_tokens": 4511712.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5291843414306641, | |
| "epoch": 1.0335820895522387, | |
| "grad_norm": 0.12552055716514587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229098796844482, | |
| "mean_token_accuracy": 0.7861831933259964, | |
| "num_tokens": 4527757.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5402754694223404, | |
| "epoch": 1.037313432835821, | |
| "grad_norm": 0.12993621826171875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389543771743774, | |
| "mean_token_accuracy": 0.782686859369278, | |
| "num_tokens": 4544172.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5249762684106827, | |
| "epoch": 1.041044776119403, | |
| "grad_norm": 0.1478368192911148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288144946098328, | |
| "mean_token_accuracy": 0.7870309799909592, | |
| "num_tokens": 4560317.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5261744558811188, | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 0.12392111867666245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337116122245789, | |
| "mean_token_accuracy": 0.7859398722648621, | |
| "num_tokens": 4576552.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5196933448314667, | |
| "epoch": 1.0485074626865671, | |
| "grad_norm": 0.13088668882846832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231020450592041, | |
| "mean_token_accuracy": 0.7892478257417679, | |
| "num_tokens": 4592581.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.530863881111145, | |
| "epoch": 1.0522388059701493, | |
| "grad_norm": 0.12411776930093765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521477460861206, | |
| "mean_token_accuracy": 0.7883302420377731, | |
| "num_tokens": 4609148.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5147035792469978, | |
| "epoch": 1.0559701492537314, | |
| "grad_norm": 0.11664963513612747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5152803063392639, | |
| "mean_token_accuracy": 0.7897714674472809, | |
| "num_tokens": 4625339.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5241324007511139, | |
| "epoch": 1.0597014925373134, | |
| "grad_norm": 0.12206321954727173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279011726379395, | |
| "mean_token_accuracy": 0.7872984111309052, | |
| "num_tokens": 4641602.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5386586785316467, | |
| "epoch": 1.0634328358208955, | |
| "grad_norm": 0.15844044089317322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462183356285095, | |
| "mean_token_accuracy": 0.7776554077863693, | |
| "num_tokens": 4657935.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5212236195802689, | |
| "epoch": 1.0671641791044777, | |
| "grad_norm": 0.12227971851825714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524368941783905, | |
| "mean_token_accuracy": 0.7889244109392166, | |
| "num_tokens": 4674375.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5298297703266144, | |
| "epoch": 1.0708955223880596, | |
| "grad_norm": 0.11141645163297653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300790667533875, | |
| "mean_token_accuracy": 0.7826484590768814, | |
| "num_tokens": 4690771.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5472451746463776, | |
| "epoch": 1.0746268656716418, | |
| "grad_norm": 0.12320703268051147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423391461372375, | |
| "mean_token_accuracy": 0.780271515250206, | |
| "num_tokens": 4707429.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5120319426059723, | |
| "epoch": 1.078358208955224, | |
| "grad_norm": 0.12205273658037186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5049785375595093, | |
| "mean_token_accuracy": 0.7964775711297989, | |
| "num_tokens": 4723707.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5228906571865082, | |
| "epoch": 1.0820895522388059, | |
| "grad_norm": 0.14154046773910522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207747220993042, | |
| "mean_token_accuracy": 0.7898598164319992, | |
| "num_tokens": 4739904.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.522852934896946, | |
| "epoch": 1.085820895522388, | |
| "grad_norm": 0.12813158333301544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229586958885193, | |
| "mean_token_accuracy": 0.7879058122634888, | |
| "num_tokens": 4756146.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5175448060035706, | |
| "epoch": 1.0895522388059702, | |
| "grad_norm": 0.1693999171257019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333408713340759, | |
| "mean_token_accuracy": 0.7839324027299881, | |
| "num_tokens": 4772324.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5461927354335785, | |
| "epoch": 1.0932835820895523, | |
| "grad_norm": 0.1190054640173912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545452892780304, | |
| "mean_token_accuracy": 0.7791879326105118, | |
| "num_tokens": 4788838.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5367765128612518, | |
| "epoch": 1.0970149253731343, | |
| "grad_norm": 0.160573810338974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323442816734314, | |
| "mean_token_accuracy": 0.7844058275222778, | |
| "num_tokens": 4805260.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5449754297733307, | |
| "epoch": 1.1007462686567164, | |
| "grad_norm": 0.13656781613826752, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343316793441772, | |
| "mean_token_accuracy": 0.786631390452385, | |
| "num_tokens": 4821651.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.539639413356781, | |
| "epoch": 1.1044776119402986, | |
| "grad_norm": 0.15722377598285675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537823498249054, | |
| "mean_token_accuracy": 0.7838342785835266, | |
| "num_tokens": 4838086.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5071177557110786, | |
| "epoch": 1.1082089552238805, | |
| "grad_norm": 0.13242004811763763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.519379198551178, | |
| "mean_token_accuracy": 0.790022000670433, | |
| "num_tokens": 4854421.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5327034294605255, | |
| "epoch": 1.1119402985074627, | |
| "grad_norm": 0.21717894077301025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451952815055847, | |
| "mean_token_accuracy": 0.7793966829776764, | |
| "num_tokens": 4870862.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5120953842997551, | |
| "epoch": 1.1156716417910448, | |
| "grad_norm": 0.11570360511541367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5137699246406555, | |
| "mean_token_accuracy": 0.7910549491643906, | |
| "num_tokens": 4887047.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5416189283132553, | |
| "epoch": 1.1194029850746268, | |
| "grad_norm": 0.15835031867027283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377160310745239, | |
| "mean_token_accuracy": 0.7817842811346054, | |
| "num_tokens": 4903770.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5600537657737732, | |
| "epoch": 1.123134328358209, | |
| "grad_norm": 0.16074593365192413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558266043663025, | |
| "mean_token_accuracy": 0.7756943106651306, | |
| "num_tokens": 4920314.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5424332320690155, | |
| "epoch": 1.126865671641791, | |
| "grad_norm": 0.13547547161579132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412736535072327, | |
| "mean_token_accuracy": 0.7802875488996506, | |
| "num_tokens": 4936795.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5479728579521179, | |
| "epoch": 1.1305970149253732, | |
| "grad_norm": 0.17388752102851868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473156571388245, | |
| "mean_token_accuracy": 0.7779090404510498, | |
| "num_tokens": 4953215.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5354913771152496, | |
| "epoch": 1.1343283582089552, | |
| "grad_norm": 0.12070244550704956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346955060958862, | |
| "mean_token_accuracy": 0.7821491658687592, | |
| "num_tokens": 4969473.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5357395708560944, | |
| "epoch": 1.1380597014925373, | |
| "grad_norm": 0.1695796698331833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382478833198547, | |
| "mean_token_accuracy": 0.7825665175914764, | |
| "num_tokens": 4985892.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5406463518738747, | |
| "epoch": 1.1417910447761195, | |
| "grad_norm": 0.13278549909591675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439954996109009, | |
| "mean_token_accuracy": 0.781127467751503, | |
| "num_tokens": 5002244.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5423679053783417, | |
| "epoch": 1.1455223880597014, | |
| "grad_norm": 0.1525002419948578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506120324134827, | |
| "mean_token_accuracy": 0.7751760631799698, | |
| "num_tokens": 5018518.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5409325361251831, | |
| "epoch": 1.1492537313432836, | |
| "grad_norm": 0.1641884595155716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398315787315369, | |
| "mean_token_accuracy": 0.7811702787876129, | |
| "num_tokens": 5034880.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.527726948261261, | |
| "epoch": 1.1529850746268657, | |
| "grad_norm": 0.13098926842212677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239942669868469, | |
| "mean_token_accuracy": 0.7863958179950714, | |
| "num_tokens": 5051492.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5603475868701935, | |
| "epoch": 1.1567164179104479, | |
| "grad_norm": 0.17059364914894104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537184476852417, | |
| "mean_token_accuracy": 0.7751886546611786, | |
| "num_tokens": 5067902.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.522188276052475, | |
| "epoch": 1.1604477611940298, | |
| "grad_norm": 0.14454245567321777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286940932273865, | |
| "mean_token_accuracy": 0.7850693166255951, | |
| "num_tokens": 5084221.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5343948155641556, | |
| "epoch": 1.164179104477612, | |
| "grad_norm": 0.13227348029613495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384489297866821, | |
| "mean_token_accuracy": 0.7807275205850601, | |
| "num_tokens": 5100663.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5275873988866806, | |
| "epoch": 1.1679104477611941, | |
| "grad_norm": 0.1753464788198471, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382294058799744, | |
| "mean_token_accuracy": 0.7828755527734756, | |
| "num_tokens": 5117302.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5497360378503799, | |
| "epoch": 1.171641791044776, | |
| "grad_norm": 0.13286371529102325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496618151664734, | |
| "mean_token_accuracy": 0.7774941623210907, | |
| "num_tokens": 5133769.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.532920241355896, | |
| "epoch": 1.1753731343283582, | |
| "grad_norm": 0.15036581456661224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5245468020439148, | |
| "mean_token_accuracy": 0.7888032495975494, | |
| "num_tokens": 5150119.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5440064817667007, | |
| "epoch": 1.1791044776119404, | |
| "grad_norm": 0.13510671257972717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358728170394897, | |
| "mean_token_accuracy": 0.7828054130077362, | |
| "num_tokens": 5166721.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5312670171260834, | |
| "epoch": 1.1828358208955223, | |
| "grad_norm": 0.11371396481990814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337090492248535, | |
| "mean_token_accuracy": 0.7806256115436554, | |
| "num_tokens": 5182960.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5359569638967514, | |
| "epoch": 1.1865671641791045, | |
| "grad_norm": 0.1442011594772339, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444678068161011, | |
| "mean_token_accuracy": 0.7807507514953613, | |
| "num_tokens": 5199188.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5328075140714645, | |
| "epoch": 1.1902985074626866, | |
| "grad_norm": 0.14832444489002228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382975339889526, | |
| "mean_token_accuracy": 0.7805762439966202, | |
| "num_tokens": 5215650.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5216325521469116, | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 0.14424221217632294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5250576734542847, | |
| "mean_token_accuracy": 0.7859031856060028, | |
| "num_tokens": 5231820.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5351075977087021, | |
| "epoch": 1.1977611940298507, | |
| "grad_norm": 0.14221367239952087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295757055282593, | |
| "mean_token_accuracy": 0.7862369567155838, | |
| "num_tokens": 5248279.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5397693365812302, | |
| "epoch": 1.2014925373134329, | |
| "grad_norm": 0.13292263448238373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341707468032837, | |
| "mean_token_accuracy": 0.7843815088272095, | |
| "num_tokens": 5264712.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5192128270864487, | |
| "epoch": 1.205223880597015, | |
| "grad_norm": 0.14713309705257416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247495770454407, | |
| "mean_token_accuracy": 0.7879969924688339, | |
| "num_tokens": 5280975.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.542580246925354, | |
| "epoch": 1.208955223880597, | |
| "grad_norm": 0.1425526738166809, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457293391227722, | |
| "mean_token_accuracy": 0.7779300808906555, | |
| "num_tokens": 5297373.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.51340202242136, | |
| "epoch": 1.212686567164179, | |
| "grad_norm": 0.13574931025505066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158831477165222, | |
| "mean_token_accuracy": 0.7899662852287292, | |
| "num_tokens": 5313524.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5239507853984833, | |
| "epoch": 1.2164179104477613, | |
| "grad_norm": 0.1242108941078186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264536142349243, | |
| "mean_token_accuracy": 0.7876432240009308, | |
| "num_tokens": 5330035.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5461296737194061, | |
| "epoch": 1.2201492537313432, | |
| "grad_norm": 0.13526761531829834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456458330154419, | |
| "mean_token_accuracy": 0.7787662595510483, | |
| "num_tokens": 5346713.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5285127460956573, | |
| "epoch": 1.2238805970149254, | |
| "grad_norm": 0.1288863569498062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286239385604858, | |
| "mean_token_accuracy": 0.7839469760656357, | |
| "num_tokens": 5362892.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5281976014375687, | |
| "epoch": 1.2276119402985075, | |
| "grad_norm": 0.15830843150615692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338830351829529, | |
| "mean_token_accuracy": 0.7864977121353149, | |
| "num_tokens": 5379105.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.537989154458046, | |
| "epoch": 1.2313432835820897, | |
| "grad_norm": 0.14264224469661713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378222465515137, | |
| "mean_token_accuracy": 0.7845461368560791, | |
| "num_tokens": 5395557.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5446864664554596, | |
| "epoch": 1.2350746268656716, | |
| "grad_norm": 0.15385743975639343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452708005905151, | |
| "mean_token_accuracy": 0.7787858992815018, | |
| "num_tokens": 5411870.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5162093490362167, | |
| "epoch": 1.2388059701492538, | |
| "grad_norm": 0.13330549001693726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5179134607315063, | |
| "mean_token_accuracy": 0.7886767089366913, | |
| "num_tokens": 5428174.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5166965126991272, | |
| "epoch": 1.242537313432836, | |
| "grad_norm": 0.13044792413711548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5149925947189331, | |
| "mean_token_accuracy": 0.7877358198165894, | |
| "num_tokens": 5444504.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5293487906455994, | |
| "epoch": 1.2462686567164178, | |
| "grad_norm": 0.15583756566047668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320658087730408, | |
| "mean_token_accuracy": 0.7861583828926086, | |
| "num_tokens": 5460813.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5320923030376434, | |
| "epoch": 1.25, | |
| "grad_norm": 0.12959426641464233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345736145973206, | |
| "mean_token_accuracy": 0.7825423777103424, | |
| "num_tokens": 5477333.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5326530635356903, | |
| "epoch": 1.2537313432835822, | |
| "grad_norm": 0.15951137244701385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311124920845032, | |
| "mean_token_accuracy": 0.7841883301734924, | |
| "num_tokens": 5493735.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.544501468539238, | |
| "epoch": 1.2574626865671643, | |
| "grad_norm": 0.12288819998502731, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451238751411438, | |
| "mean_token_accuracy": 0.7775899171829224, | |
| "num_tokens": 5510068.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5330418646335602, | |
| "epoch": 1.2611940298507462, | |
| "grad_norm": 0.13410672545433044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535346269607544, | |
| "mean_token_accuracy": 0.7835884392261505, | |
| "num_tokens": 5526452.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5434266775846481, | |
| "epoch": 1.2649253731343284, | |
| "grad_norm": 0.13076815009117126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440234541893005, | |
| "mean_token_accuracy": 0.7821687757968903, | |
| "num_tokens": 5542951.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5151484906673431, | |
| "epoch": 1.2686567164179103, | |
| "grad_norm": 0.12828661501407623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5160608887672424, | |
| "mean_token_accuracy": 0.791755273938179, | |
| "num_tokens": 5559086.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5275644734501839, | |
| "epoch": 1.2723880597014925, | |
| "grad_norm": 0.13408422470092773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317025184631348, | |
| "mean_token_accuracy": 0.7861050963401794, | |
| "num_tokens": 5575521.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5177630484104156, | |
| "epoch": 1.2761194029850746, | |
| "grad_norm": 0.12419670075178146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5191144347190857, | |
| "mean_token_accuracy": 0.7892575412988663, | |
| "num_tokens": 5591947.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5407169461250305, | |
| "epoch": 1.2798507462686568, | |
| "grad_norm": 0.1364241987466812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430530309677124, | |
| "mean_token_accuracy": 0.779339611530304, | |
| "num_tokens": 5608447.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5262736082077026, | |
| "epoch": 1.2835820895522387, | |
| "grad_norm": 0.15587468445301056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301055312156677, | |
| "mean_token_accuracy": 0.7836160659790039, | |
| "num_tokens": 5625044.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5458462238311768, | |
| "epoch": 1.287313432835821, | |
| "grad_norm": 0.13173708319664001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517262816429138, | |
| "mean_token_accuracy": 0.7764803022146225, | |
| "num_tokens": 5641335.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5216450393199921, | |
| "epoch": 1.291044776119403, | |
| "grad_norm": 0.17484262585639954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218112468719482, | |
| "mean_token_accuracy": 0.7843209207057953, | |
| "num_tokens": 5657347.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5498285889625549, | |
| "epoch": 1.294776119402985, | |
| "grad_norm": 0.12871748208999634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382349491119385, | |
| "mean_token_accuracy": 0.7812492400407791, | |
| "num_tokens": 5673588.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5317611545324326, | |
| "epoch": 1.2985074626865671, | |
| "grad_norm": 0.15342608094215393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276378989219666, | |
| "mean_token_accuracy": 0.7836941033601761, | |
| "num_tokens": 5689687.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5218729674816132, | |
| "epoch": 1.3022388059701493, | |
| "grad_norm": 0.1535658985376358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265159606933594, | |
| "mean_token_accuracy": 0.7863410115242004, | |
| "num_tokens": 5705883.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5283405184745789, | |
| "epoch": 1.3059701492537314, | |
| "grad_norm": 0.1400662213563919, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348565578460693, | |
| "mean_token_accuracy": 0.7835897505283356, | |
| "num_tokens": 5722396.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5465448051691055, | |
| "epoch": 1.3097014925373134, | |
| "grad_norm": 0.1789598912000656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508973002433777, | |
| "mean_token_accuracy": 0.7770535051822662, | |
| "num_tokens": 5738946.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5288202613592148, | |
| "epoch": 1.3134328358208955, | |
| "grad_norm": 0.12526051700115204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298986434936523, | |
| "mean_token_accuracy": 0.7855530083179474, | |
| "num_tokens": 5755207.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5429712533950806, | |
| "epoch": 1.3171641791044777, | |
| "grad_norm": 0.12195583432912827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387951731681824, | |
| "mean_token_accuracy": 0.7802612334489822, | |
| "num_tokens": 5771582.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5358787178993225, | |
| "epoch": 1.3208955223880596, | |
| "grad_norm": 0.15126559138298035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349993705749512, | |
| "mean_token_accuracy": 0.7822433114051819, | |
| "num_tokens": 5787967.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5424338132143021, | |
| "epoch": 1.3246268656716418, | |
| "grad_norm": 0.1308310180902481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434916615486145, | |
| "mean_token_accuracy": 0.7826928794384003, | |
| "num_tokens": 5804528.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5337295234203339, | |
| "epoch": 1.328358208955224, | |
| "grad_norm": 0.16843028366565704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465773344039917, | |
| "mean_token_accuracy": 0.777764692902565, | |
| "num_tokens": 5820684.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.504702128469944, | |
| "epoch": 1.332089552238806, | |
| "grad_norm": 0.1529076248407364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5113453269004822, | |
| "mean_token_accuracy": 0.791937530040741, | |
| "num_tokens": 5836988.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.536053940653801, | |
| "epoch": 1.335820895522388, | |
| "grad_norm": 0.1379069983959198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389484763145447, | |
| "mean_token_accuracy": 0.7813952714204788, | |
| "num_tokens": 5853542.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5438119322061539, | |
| "epoch": 1.3395522388059702, | |
| "grad_norm": 0.12008243054151535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360631346702576, | |
| "mean_token_accuracy": 0.7817373275756836, | |
| "num_tokens": 5870213.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.550885871052742, | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 0.13378706574440002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54970383644104, | |
| "mean_token_accuracy": 0.7768265455961227, | |
| "num_tokens": 5886513.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5400225073099136, | |
| "epoch": 1.3470149253731343, | |
| "grad_norm": 0.13530388474464417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343542098999023, | |
| "mean_token_accuracy": 0.782709077000618, | |
| "num_tokens": 5903049.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5389147847890854, | |
| "epoch": 1.3507462686567164, | |
| "grad_norm": 0.12446677684783936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388710498809814, | |
| "mean_token_accuracy": 0.781377524137497, | |
| "num_tokens": 5919403.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.537296935915947, | |
| "epoch": 1.3544776119402986, | |
| "grad_norm": 0.13781245052814484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438515543937683, | |
| "mean_token_accuracy": 0.7785618007183075, | |
| "num_tokens": 5935511.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5429168194532394, | |
| "epoch": 1.3582089552238805, | |
| "grad_norm": 0.13629309833049774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453547239303589, | |
| "mean_token_accuracy": 0.7784431874752045, | |
| "num_tokens": 5951972.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5427183359861374, | |
| "epoch": 1.3619402985074627, | |
| "grad_norm": 0.1370571255683899, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545956552028656, | |
| "mean_token_accuracy": 0.7787607759237289, | |
| "num_tokens": 5968229.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5378859043121338, | |
| "epoch": 1.3656716417910448, | |
| "grad_norm": 0.12471959739923477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353823900222778, | |
| "mean_token_accuracy": 0.7809005975723267, | |
| "num_tokens": 5984669.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5365873426198959, | |
| "epoch": 1.3694029850746268, | |
| "grad_norm": 0.16501657664775848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319327712059021, | |
| "mean_token_accuracy": 0.7824555039405823, | |
| "num_tokens": 6001027.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5265276953577995, | |
| "epoch": 1.373134328358209, | |
| "grad_norm": 0.12363235652446747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210375785827637, | |
| "mean_token_accuracy": 0.7883688807487488, | |
| "num_tokens": 6017125.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5277390778064728, | |
| "epoch": 1.376865671641791, | |
| "grad_norm": 0.1423310935497284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316471457481384, | |
| "mean_token_accuracy": 0.7828662693500519, | |
| "num_tokens": 6033508.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5263610854744911, | |
| "epoch": 1.3805970149253732, | |
| "grad_norm": 0.1381843090057373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311442613601685, | |
| "mean_token_accuracy": 0.7821517586708069, | |
| "num_tokens": 6049886.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5286078453063965, | |
| "epoch": 1.3843283582089552, | |
| "grad_norm": 0.18003322184085846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398144721984863, | |
| "mean_token_accuracy": 0.7803981304168701, | |
| "num_tokens": 6066120.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5356258824467659, | |
| "epoch": 1.3880597014925373, | |
| "grad_norm": 0.11802922934293747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53504878282547, | |
| "mean_token_accuracy": 0.7814585119485855, | |
| "num_tokens": 6082732.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5387788712978363, | |
| "epoch": 1.3917910447761195, | |
| "grad_norm": 0.13874171674251556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358333587646484, | |
| "mean_token_accuracy": 0.7825580388307571, | |
| "num_tokens": 6099018.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5342960059642792, | |
| "epoch": 1.3955223880597014, | |
| "grad_norm": 0.1402461677789688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348989963531494, | |
| "mean_token_accuracy": 0.7847650349140167, | |
| "num_tokens": 6115279.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5361053943634033, | |
| "epoch": 1.3992537313432836, | |
| "grad_norm": 0.11853493005037308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328879356384277, | |
| "mean_token_accuracy": 0.7853472977876663, | |
| "num_tokens": 6131854.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5300562530755997, | |
| "epoch": 1.4029850746268657, | |
| "grad_norm": 0.1642550826072693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330582857131958, | |
| "mean_token_accuracy": 0.7824369519948959, | |
| "num_tokens": 6148329.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5351111143827438, | |
| "epoch": 1.4067164179104479, | |
| "grad_norm": 0.13296250998973846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308345556259155, | |
| "mean_token_accuracy": 0.7840287983417511, | |
| "num_tokens": 6164520.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.549595445394516, | |
| "epoch": 1.4104477611940298, | |
| "grad_norm": 0.11937810480594635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439208745956421, | |
| "mean_token_accuracy": 0.7801520526409149, | |
| "num_tokens": 6180840.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5249980017542839, | |
| "epoch": 1.414179104477612, | |
| "grad_norm": 0.14947783946990967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214130878448486, | |
| "mean_token_accuracy": 0.7883247882127762, | |
| "num_tokens": 6197072.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5341014862060547, | |
| "epoch": 1.417910447761194, | |
| "grad_norm": 0.14708726108074188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437160730361938, | |
| "mean_token_accuracy": 0.7790101766586304, | |
| "num_tokens": 6213410.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5305748581886292, | |
| "epoch": 1.421641791044776, | |
| "grad_norm": 0.15660500526428223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538860559463501, | |
| "mean_token_accuracy": 0.7808915078639984, | |
| "num_tokens": 6229812.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5335244983434677, | |
| "epoch": 1.4253731343283582, | |
| "grad_norm": 0.14013393223285675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405108332633972, | |
| "mean_token_accuracy": 0.7806441932916641, | |
| "num_tokens": 6246122.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5370550155639648, | |
| "epoch": 1.4291044776119404, | |
| "grad_norm": 0.15498457849025726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275038480758667, | |
| "mean_token_accuracy": 0.7845180481672287, | |
| "num_tokens": 6262400.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5388240739703178, | |
| "epoch": 1.4328358208955223, | |
| "grad_norm": 0.13547126948833466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339113473892212, | |
| "mean_token_accuracy": 0.7817906439304352, | |
| "num_tokens": 6278433.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5327373743057251, | |
| "epoch": 1.4365671641791045, | |
| "grad_norm": 0.15488973259925842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536837637424469, | |
| "mean_token_accuracy": 0.7805320471525192, | |
| "num_tokens": 6294780.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5164054483175278, | |
| "epoch": 1.4402985074626866, | |
| "grad_norm": 0.13659167289733887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196657180786133, | |
| "mean_token_accuracy": 0.7893420159816742, | |
| "num_tokens": 6310926.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5441898256540298, | |
| "epoch": 1.4440298507462686, | |
| "grad_norm": 0.30239349603652954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498929023742676, | |
| "mean_token_accuracy": 0.7768156677484512, | |
| "num_tokens": 6327465.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5278986096382141, | |
| "epoch": 1.4477611940298507, | |
| "grad_norm": 0.16996067762374878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285515785217285, | |
| "mean_token_accuracy": 0.786761000752449, | |
| "num_tokens": 6343503.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.508112832903862, | |
| "epoch": 1.4514925373134329, | |
| "grad_norm": 0.14852264523506165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5129667520523071, | |
| "mean_token_accuracy": 0.7919276505708694, | |
| "num_tokens": 6359667.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5249242335557938, | |
| "epoch": 1.455223880597015, | |
| "grad_norm": 0.17182905972003937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207914113998413, | |
| "mean_token_accuracy": 0.7878070920705795, | |
| "num_tokens": 6376114.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5415022522211075, | |
| "epoch": 1.458955223880597, | |
| "grad_norm": 0.14497698843479156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450653433799744, | |
| "mean_token_accuracy": 0.7796677798032761, | |
| "num_tokens": 6392417.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5454135686159134, | |
| "epoch": 1.462686567164179, | |
| "grad_norm": 0.14885719120502472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476389527320862, | |
| "mean_token_accuracy": 0.7781424224376678, | |
| "num_tokens": 6408701.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5305422842502594, | |
| "epoch": 1.4664179104477613, | |
| "grad_norm": 0.13111279904842377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283982753753662, | |
| "mean_token_accuracy": 0.786282405257225, | |
| "num_tokens": 6425186.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.519924134016037, | |
| "epoch": 1.4701492537313432, | |
| "grad_norm": 0.15385456383228302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183860659599304, | |
| "mean_token_accuracy": 0.7890526354312897, | |
| "num_tokens": 6441474.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5419893115758896, | |
| "epoch": 1.4738805970149254, | |
| "grad_norm": 0.12959027290344238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391095876693726, | |
| "mean_token_accuracy": 0.7845679074525833, | |
| "num_tokens": 6458137.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5297622233629227, | |
| "epoch": 1.4776119402985075, | |
| "grad_norm": 0.12876980006694794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316991209983826, | |
| "mean_token_accuracy": 0.783607617020607, | |
| "num_tokens": 6474605.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5133326500654221, | |
| "epoch": 1.4813432835820897, | |
| "grad_norm": 0.23840782046318054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223475098609924, | |
| "mean_token_accuracy": 0.7896056026220322, | |
| "num_tokens": 6490747.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.540631890296936, | |
| "epoch": 1.4850746268656716, | |
| "grad_norm": 0.18176521360874176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429366230964661, | |
| "mean_token_accuracy": 0.7787415534257889, | |
| "num_tokens": 6507149.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5534960627555847, | |
| "epoch": 1.4888059701492538, | |
| "grad_norm": 0.38266992568969727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5652564764022827, | |
| "mean_token_accuracy": 0.7736776769161224, | |
| "num_tokens": 6523502.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5438710153102875, | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 0.15845677256584167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439051985740662, | |
| "mean_token_accuracy": 0.7816531956195831, | |
| "num_tokens": 6539815.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5452860891819, | |
| "epoch": 1.4962686567164178, | |
| "grad_norm": 0.19755159318447113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404053926467896, | |
| "mean_token_accuracy": 0.7815948128700256, | |
| "num_tokens": 6555976.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5241969153285027, | |
| "epoch": 1.5, | |
| "grad_norm": 0.14966075122356415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5205419063568115, | |
| "mean_token_accuracy": 0.7888282835483551, | |
| "num_tokens": 6572116.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5179315954446793, | |
| "epoch": 1.5037313432835822, | |
| "grad_norm": 0.15208128094673157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5195380449295044, | |
| "mean_token_accuracy": 0.7901398837566376, | |
| "num_tokens": 6588360.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5443613976240158, | |
| "epoch": 1.5074626865671643, | |
| "grad_norm": 0.15764807164669037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409551858901978, | |
| "mean_token_accuracy": 0.7817244678735733, | |
| "num_tokens": 6604909.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5555933266878128, | |
| "epoch": 1.5111940298507462, | |
| "grad_norm": 0.15518265962600708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575823187828064, | |
| "mean_token_accuracy": 0.7727370858192444, | |
| "num_tokens": 6621271.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5448516458272934, | |
| "epoch": 1.5149253731343284, | |
| "grad_norm": 0.13999900221824646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443175435066223, | |
| "mean_token_accuracy": 0.7797447293996811, | |
| "num_tokens": 6637394.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5633855164051056, | |
| "epoch": 1.5186567164179103, | |
| "grad_norm": 0.12512464821338654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552009344100952, | |
| "mean_token_accuracy": 0.7740202099084854, | |
| "num_tokens": 6653670.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5442499816417694, | |
| "epoch": 1.5223880597014925, | |
| "grad_norm": 0.13073165714740753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353500843048096, | |
| "mean_token_accuracy": 0.7859338223934174, | |
| "num_tokens": 6670329.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5133479535579681, | |
| "epoch": 1.5261194029850746, | |
| "grad_norm": 0.1424253284931183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181159377098083, | |
| "mean_token_accuracy": 0.791978657245636, | |
| "num_tokens": 6686590.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5216629430651665, | |
| "epoch": 1.5298507462686568, | |
| "grad_norm": 0.15952785313129425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411725640296936, | |
| "mean_token_accuracy": 0.7812029272317886, | |
| "num_tokens": 6702970.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5392735451459885, | |
| "epoch": 1.533582089552239, | |
| "grad_norm": 0.13047060370445251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485432147979736, | |
| "mean_token_accuracy": 0.7774497866630554, | |
| "num_tokens": 6719627.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5269859135150909, | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.13100764155387878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288376212120056, | |
| "mean_token_accuracy": 0.7857958972454071, | |
| "num_tokens": 6735951.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.546154260635376, | |
| "epoch": 1.5410447761194028, | |
| "grad_norm": 0.13160941004753113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382481813430786, | |
| "mean_token_accuracy": 0.7786583751440048, | |
| "num_tokens": 6752564.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.552439495921135, | |
| "epoch": 1.544776119402985, | |
| "grad_norm": 0.13911442458629608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381487011909485, | |
| "mean_token_accuracy": 0.782607913017273, | |
| "num_tokens": 6768993.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5463637262582779, | |
| "epoch": 1.5485074626865671, | |
| "grad_norm": 0.12377088516950607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482580661773682, | |
| "mean_token_accuracy": 0.7775403410196304, | |
| "num_tokens": 6785304.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5309856235980988, | |
| "epoch": 1.5522388059701493, | |
| "grad_norm": 0.14743956923484802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372556447982788, | |
| "mean_token_accuracy": 0.7811425626277924, | |
| "num_tokens": 6801545.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5256488621234894, | |
| "epoch": 1.5559701492537314, | |
| "grad_norm": 0.13745813071727753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335954427719116, | |
| "mean_token_accuracy": 0.7857853770256042, | |
| "num_tokens": 6817793.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5426470339298248, | |
| "epoch": 1.5597014925373134, | |
| "grad_norm": 0.15729817748069763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557945966720581, | |
| "mean_token_accuracy": 0.7755606323480606, | |
| "num_tokens": 6834171.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5429180264472961, | |
| "epoch": 1.5634328358208955, | |
| "grad_norm": 0.1530143916606903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445144176483154, | |
| "mean_token_accuracy": 0.7793177515268326, | |
| "num_tokens": 6850298.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5458863228559494, | |
| "epoch": 1.5671641791044775, | |
| "grad_norm": 0.1244051530957222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383530855178833, | |
| "mean_token_accuracy": 0.7812670916318893, | |
| "num_tokens": 6866891.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.564603790640831, | |
| "epoch": 1.5708955223880596, | |
| "grad_norm": 0.14283782243728638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600205659866333, | |
| "mean_token_accuracy": 0.7725525945425034, | |
| "num_tokens": 6883247.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5389530211687088, | |
| "epoch": 1.5746268656716418, | |
| "grad_norm": 0.13312764465808868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395158529281616, | |
| "mean_token_accuracy": 0.7833812385797501, | |
| "num_tokens": 6899801.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5225178450345993, | |
| "epoch": 1.578358208955224, | |
| "grad_norm": 0.12671785056591034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530681312084198, | |
| "mean_token_accuracy": 0.7860707342624664, | |
| "num_tokens": 6916126.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5225076675415039, | |
| "epoch": 1.582089552238806, | |
| "grad_norm": 0.1846325844526291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287823677062988, | |
| "mean_token_accuracy": 0.7858179211616516, | |
| "num_tokens": 6932572.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5322756171226501, | |
| "epoch": 1.585820895522388, | |
| "grad_norm": 0.1279527246952057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314757823944092, | |
| "mean_token_accuracy": 0.7839424312114716, | |
| "num_tokens": 6948915.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5399055480957031, | |
| "epoch": 1.5895522388059702, | |
| "grad_norm": 0.14472827315330505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389757752418518, | |
| "mean_token_accuracy": 0.781254380941391, | |
| "num_tokens": 6965311.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.543253481388092, | |
| "epoch": 1.5932835820895521, | |
| "grad_norm": 0.1291203647851944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542615532875061, | |
| "mean_token_accuracy": 0.7801599353551865, | |
| "num_tokens": 6981751.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5258511453866959, | |
| "epoch": 1.5970149253731343, | |
| "grad_norm": 0.14912551641464233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5212829113006592, | |
| "mean_token_accuracy": 0.7879799157381058, | |
| "num_tokens": 6997999.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5359253436326981, | |
| "epoch": 1.6007462686567164, | |
| "grad_norm": 0.13902713358402252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354318618774414, | |
| "mean_token_accuracy": 0.7819556444883347, | |
| "num_tokens": 7014251.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5399288833141327, | |
| "epoch": 1.6044776119402986, | |
| "grad_norm": 0.15356454253196716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459235906600952, | |
| "mean_token_accuracy": 0.7798596769571304, | |
| "num_tokens": 7030929.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5489939600229263, | |
| "epoch": 1.6082089552238807, | |
| "grad_norm": 0.16724750399589539, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488972663879395, | |
| "mean_token_accuracy": 0.7782986462116241, | |
| "num_tokens": 7047344.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5521660596132278, | |
| "epoch": 1.6119402985074627, | |
| "grad_norm": 0.1370435506105423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541171431541443, | |
| "mean_token_accuracy": 0.775096669793129, | |
| "num_tokens": 7063772.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5448116213083267, | |
| "epoch": 1.6156716417910446, | |
| "grad_norm": 0.16458411514759064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444625616073608, | |
| "mean_token_accuracy": 0.7808038741350174, | |
| "num_tokens": 7080008.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5336454659700394, | |
| "epoch": 1.6194029850746268, | |
| "grad_norm": 0.13929054141044617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374733805656433, | |
| "mean_token_accuracy": 0.7845250517129898, | |
| "num_tokens": 7096322.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5490863621234894, | |
| "epoch": 1.623134328358209, | |
| "grad_norm": 0.17425119876861572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510268211364746, | |
| "mean_token_accuracy": 0.7752214223146439, | |
| "num_tokens": 7112627.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5409643575549126, | |
| "epoch": 1.626865671641791, | |
| "grad_norm": 0.1438315510749817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421441197395325, | |
| "mean_token_accuracy": 0.7772217243909836, | |
| "num_tokens": 7128753.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5132558643817902, | |
| "epoch": 1.6305970149253732, | |
| "grad_norm": 0.19491760432720184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172038674354553, | |
| "mean_token_accuracy": 0.7922582030296326, | |
| "num_tokens": 7145005.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5373466610908508, | |
| "epoch": 1.6343283582089554, | |
| "grad_norm": 0.1514309048652649, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405304431915283, | |
| "mean_token_accuracy": 0.7827999889850616, | |
| "num_tokens": 7161264.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5462755262851715, | |
| "epoch": 1.6380597014925373, | |
| "grad_norm": 0.1856052726507187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462319254875183, | |
| "mean_token_accuracy": 0.7752426117658615, | |
| "num_tokens": 7177601.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5239088907837868, | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 0.15442201495170593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269871950149536, | |
| "mean_token_accuracy": 0.7881719172000885, | |
| "num_tokens": 7194088.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5473000258207321, | |
| "epoch": 1.6455223880597014, | |
| "grad_norm": 0.1733047217130661, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468770861625671, | |
| "mean_token_accuracy": 0.7766072303056717, | |
| "num_tokens": 7210540.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5247174948453903, | |
| "epoch": 1.6492537313432836, | |
| "grad_norm": 0.15060853958129883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271586179733276, | |
| "mean_token_accuracy": 0.7868671417236328, | |
| "num_tokens": 7226800.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5296545326709747, | |
| "epoch": 1.6529850746268657, | |
| "grad_norm": 0.14210547506809235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5233073830604553, | |
| "mean_token_accuracy": 0.7905395030975342, | |
| "num_tokens": 7242933.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5275071337819099, | |
| "epoch": 1.6567164179104479, | |
| "grad_norm": 0.16420303285121918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262512564659119, | |
| "mean_token_accuracy": 0.7878832370042801, | |
| "num_tokens": 7259229.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5286994576454163, | |
| "epoch": 1.6604477611940298, | |
| "grad_norm": 0.16218696534633636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356262922286987, | |
| "mean_token_accuracy": 0.781034916639328, | |
| "num_tokens": 7275629.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5249519050121307, | |
| "epoch": 1.664179104477612, | |
| "grad_norm": 0.13650326430797577, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306994915008545, | |
| "mean_token_accuracy": 0.7849638760089874, | |
| "num_tokens": 7291780.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5290274769067764, | |
| "epoch": 1.667910447761194, | |
| "grad_norm": 0.13130812346935272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366880297660828, | |
| "mean_token_accuracy": 0.7813905030488968, | |
| "num_tokens": 7308207.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.549896240234375, | |
| "epoch": 1.671641791044776, | |
| "grad_norm": 0.13799095153808594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542113721370697, | |
| "mean_token_accuracy": 0.7787055224180222, | |
| "num_tokens": 7324630.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5595291256904602, | |
| "epoch": 1.6753731343283582, | |
| "grad_norm": 0.12968024611473083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542213320732117, | |
| "mean_token_accuracy": 0.7749587148427963, | |
| "num_tokens": 7340980.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5328024327754974, | |
| "epoch": 1.6791044776119404, | |
| "grad_norm": 0.15673688054084778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303700566291809, | |
| "mean_token_accuracy": 0.7840248346328735, | |
| "num_tokens": 7357233.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.527419738471508, | |
| "epoch": 1.6828358208955225, | |
| "grad_norm": 0.15271416306495667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339101552963257, | |
| "mean_token_accuracy": 0.7841878533363342, | |
| "num_tokens": 7373557.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5285895839333534, | |
| "epoch": 1.6865671641791045, | |
| "grad_norm": 0.1619284600019455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426527261734009, | |
| "mean_token_accuracy": 0.7801112830638885, | |
| "num_tokens": 7389775.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5160977020859718, | |
| "epoch": 1.6902985074626866, | |
| "grad_norm": 0.14479905366897583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5143705606460571, | |
| "mean_token_accuracy": 0.792098343372345, | |
| "num_tokens": 7406142.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5246409177780151, | |
| "epoch": 1.6940298507462686, | |
| "grad_norm": 0.13829895853996277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241281986236572, | |
| "mean_token_accuracy": 0.7888348549604416, | |
| "num_tokens": 7422123.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.539468988776207, | |
| "epoch": 1.6977611940298507, | |
| "grad_norm": 0.14040212333202362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538709282875061, | |
| "mean_token_accuracy": 0.7814967185258865, | |
| "num_tokens": 7438449.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5327620357275009, | |
| "epoch": 1.7014925373134329, | |
| "grad_norm": 0.13067209720611572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531409740447998, | |
| "mean_token_accuracy": 0.7817434817552567, | |
| "num_tokens": 7454843.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5428982973098755, | |
| "epoch": 1.705223880597015, | |
| "grad_norm": 0.13850897550582886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404822826385498, | |
| "mean_token_accuracy": 0.7804021388292313, | |
| "num_tokens": 7471239.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5262090265750885, | |
| "epoch": 1.7089552238805972, | |
| "grad_norm": 0.1596522480249405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263737440109253, | |
| "mean_token_accuracy": 0.7867833971977234, | |
| "num_tokens": 7487626.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5263974219560623, | |
| "epoch": 1.712686567164179, | |
| "grad_norm": 0.1885124146938324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317808985710144, | |
| "mean_token_accuracy": 0.7860947102308273, | |
| "num_tokens": 7504295.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5409001708030701, | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 0.18569619953632355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502086877822876, | |
| "mean_token_accuracy": 0.7780454903841019, | |
| "num_tokens": 7520700.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5375530123710632, | |
| "epoch": 1.7201492537313432, | |
| "grad_norm": 0.1682044118642807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417311787605286, | |
| "mean_token_accuracy": 0.7811579406261444, | |
| "num_tokens": 7537296.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5499445050954819, | |
| "epoch": 1.7238805970149254, | |
| "grad_norm": 0.13629741966724396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424147248268127, | |
| "mean_token_accuracy": 0.7811519056558609, | |
| "num_tokens": 7553751.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5344928205013275, | |
| "epoch": 1.7276119402985075, | |
| "grad_norm": 0.15897303819656372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359247326850891, | |
| "mean_token_accuracy": 0.7842150777578354, | |
| "num_tokens": 7569929.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5554052591323853, | |
| "epoch": 1.7313432835820897, | |
| "grad_norm": 0.1417708843946457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565856099128723, | |
| "mean_token_accuracy": 0.7738053798675537, | |
| "num_tokens": 7586469.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5416853874921799, | |
| "epoch": 1.7350746268656716, | |
| "grad_norm": 0.13722717761993408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358468294143677, | |
| "mean_token_accuracy": 0.7817960679531097, | |
| "num_tokens": 7602590.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5408632606267929, | |
| "epoch": 1.7388059701492538, | |
| "grad_norm": 0.157133087515831, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427348017692566, | |
| "mean_token_accuracy": 0.7806098312139511, | |
| "num_tokens": 7618775.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5247721523046494, | |
| "epoch": 1.7425373134328357, | |
| "grad_norm": 0.14061616361141205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321290493011475, | |
| "mean_token_accuracy": 0.7820450663566589, | |
| "num_tokens": 7635093.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5205557495355606, | |
| "epoch": 1.7462686567164178, | |
| "grad_norm": 0.16123539209365845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302354097366333, | |
| "mean_token_accuracy": 0.7854211032390594, | |
| "num_tokens": 7651685.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5282921940088272, | |
| "epoch": 1.75, | |
| "grad_norm": 0.15153366327285767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328198671340942, | |
| "mean_token_accuracy": 0.7865671813488007, | |
| "num_tokens": 7667959.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5481950640678406, | |
| "epoch": 1.7537313432835822, | |
| "grad_norm": 0.12894481420516968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497183799743652, | |
| "mean_token_accuracy": 0.778036966919899, | |
| "num_tokens": 7684463.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5427480936050415, | |
| "epoch": 1.7574626865671643, | |
| "grad_norm": 0.13647432625293732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53739994764328, | |
| "mean_token_accuracy": 0.7818431705236435, | |
| "num_tokens": 7700823.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5326214283704758, | |
| "epoch": 1.7611940298507462, | |
| "grad_norm": 0.13095979392528534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291880369186401, | |
| "mean_token_accuracy": 0.7828460037708282, | |
| "num_tokens": 7717112.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5443256497383118, | |
| "epoch": 1.7649253731343284, | |
| "grad_norm": 0.15335077047348022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414584875106812, | |
| "mean_token_accuracy": 0.7815631777048111, | |
| "num_tokens": 7733478.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5510082393884659, | |
| "epoch": 1.7686567164179103, | |
| "grad_norm": 0.12999047338962555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502053499221802, | |
| "mean_token_accuracy": 0.7768876850605011, | |
| "num_tokens": 7749733.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5287549048662186, | |
| "epoch": 1.7723880597014925, | |
| "grad_norm": 0.14021116495132446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351616740226746, | |
| "mean_token_accuracy": 0.7807136327028275, | |
| "num_tokens": 7766232.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5237460732460022, | |
| "epoch": 1.7761194029850746, | |
| "grad_norm": 0.13716712594032288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256913900375366, | |
| "mean_token_accuracy": 0.7864228338003159, | |
| "num_tokens": 7782399.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5354984253644943, | |
| "epoch": 1.7798507462686568, | |
| "grad_norm": 0.1459989845752716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544316291809082, | |
| "mean_token_accuracy": 0.7797362506389618, | |
| "num_tokens": 7798787.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5540675520896912, | |
| "epoch": 1.783582089552239, | |
| "grad_norm": 0.12925799190998077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467855334281921, | |
| "mean_token_accuracy": 0.7801081091165543, | |
| "num_tokens": 7815176.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5433181077241898, | |
| "epoch": 1.787313432835821, | |
| "grad_norm": 0.14298273622989655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409605503082275, | |
| "mean_token_accuracy": 0.7799843400716782, | |
| "num_tokens": 7831722.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5389926880598068, | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 0.13404588401317596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434566736221313, | |
| "mean_token_accuracy": 0.7795996069908142, | |
| "num_tokens": 7847789.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5389460772275925, | |
| "epoch": 1.794776119402985, | |
| "grad_norm": 0.14891406893730164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478703379631042, | |
| "mean_token_accuracy": 0.7784013152122498, | |
| "num_tokens": 7864039.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5258179157972336, | |
| "epoch": 1.7985074626865671, | |
| "grad_norm": 0.1405036896467209, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313145518302917, | |
| "mean_token_accuracy": 0.7844405174255371, | |
| "num_tokens": 7880600.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.546451672911644, | |
| "epoch": 1.8022388059701493, | |
| "grad_norm": 0.12789376080036163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392628908157349, | |
| "mean_token_accuracy": 0.780185878276825, | |
| "num_tokens": 7897113.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5462568253278732, | |
| "epoch": 1.8059701492537314, | |
| "grad_norm": 0.15970084071159363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421247482299805, | |
| "mean_token_accuracy": 0.7790002077817917, | |
| "num_tokens": 7913715.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5245223939418793, | |
| "epoch": 1.8097014925373134, | |
| "grad_norm": 0.12480644881725311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243803858757019, | |
| "mean_token_accuracy": 0.7877090722322464, | |
| "num_tokens": 7930253.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5543881952762604, | |
| "epoch": 1.8134328358208955, | |
| "grad_norm": 0.17440125346183777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611490607261658, | |
| "mean_token_accuracy": 0.773423507809639, | |
| "num_tokens": 7946773.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5448231846094131, | |
| "epoch": 1.8171641791044775, | |
| "grad_norm": 0.1254844069480896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429874658584595, | |
| "mean_token_accuracy": 0.7773167043924332, | |
| "num_tokens": 7963214.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5249373018741608, | |
| "epoch": 1.8208955223880596, | |
| "grad_norm": 0.13412347435951233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265883803367615, | |
| "mean_token_accuracy": 0.7875321507453918, | |
| "num_tokens": 7979611.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5382010042667389, | |
| "epoch": 1.8246268656716418, | |
| "grad_norm": 0.16182008385658264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412148237228394, | |
| "mean_token_accuracy": 0.7788311392068863, | |
| "num_tokens": 7996094.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5332826524972916, | |
| "epoch": 1.828358208955224, | |
| "grad_norm": 0.1427432894706726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368761420249939, | |
| "mean_token_accuracy": 0.7825220227241516, | |
| "num_tokens": 8012432.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5108669325709343, | |
| "epoch": 1.832089552238806, | |
| "grad_norm": 0.1509285867214203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5119490623474121, | |
| "mean_token_accuracy": 0.7906075417995453, | |
| "num_tokens": 8028665.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5145807713270187, | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 0.1396896094083786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196783542633057, | |
| "mean_token_accuracy": 0.7873106449842453, | |
| "num_tokens": 8044855.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5123258233070374, | |
| "epoch": 1.8395522388059702, | |
| "grad_norm": 0.14697767794132233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223352909088135, | |
| "mean_token_accuracy": 0.7885845303535461, | |
| "num_tokens": 8061121.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5335386842489243, | |
| "epoch": 1.8432835820895521, | |
| "grad_norm": 0.14804190397262573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534782350063324, | |
| "mean_token_accuracy": 0.7838051915168762, | |
| "num_tokens": 8077519.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5602670460939407, | |
| "epoch": 1.8470149253731343, | |
| "grad_norm": 0.13603031635284424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542025566101074, | |
| "mean_token_accuracy": 0.7756092548370361, | |
| "num_tokens": 8093937.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5355454534292221, | |
| "epoch": 1.8507462686567164, | |
| "grad_norm": 0.11670524626970291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269724130630493, | |
| "mean_token_accuracy": 0.7864131927490234, | |
| "num_tokens": 8110383.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5373311340808868, | |
| "epoch": 1.8544776119402986, | |
| "grad_norm": 0.13412456214427948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295535326004028, | |
| "mean_token_accuracy": 0.7874404042959213, | |
| "num_tokens": 8126795.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5373153984546661, | |
| "epoch": 1.8582089552238807, | |
| "grad_norm": 0.1485511064529419, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427818894386292, | |
| "mean_token_accuracy": 0.7803584039211273, | |
| "num_tokens": 8143234.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.522105023264885, | |
| "epoch": 1.8619402985074627, | |
| "grad_norm": 0.1580716073513031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267635583877563, | |
| "mean_token_accuracy": 0.7869967371225357, | |
| "num_tokens": 8159687.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5215406715869904, | |
| "epoch": 1.8656716417910446, | |
| "grad_norm": 0.1573050171136856, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285288691520691, | |
| "mean_token_accuracy": 0.7851908951997757, | |
| "num_tokens": 8176020.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5404719114303589, | |
| "epoch": 1.8694029850746268, | |
| "grad_norm": 0.1411486119031906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365728735923767, | |
| "mean_token_accuracy": 0.7837002873420715, | |
| "num_tokens": 8192551.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5438470244407654, | |
| "epoch": 1.873134328358209, | |
| "grad_norm": 0.130998432636261, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430339574813843, | |
| "mean_token_accuracy": 0.7819307893514633, | |
| "num_tokens": 8209082.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5403178930282593, | |
| "epoch": 1.876865671641791, | |
| "grad_norm": 0.1385144740343094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460789203643799, | |
| "mean_token_accuracy": 0.7790951728820801, | |
| "num_tokens": 8225744.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5280100554227829, | |
| "epoch": 1.8805970149253732, | |
| "grad_norm": 0.14330939948558807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235118269920349, | |
| "mean_token_accuracy": 0.7890605628490448, | |
| "num_tokens": 8242208.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5532096922397614, | |
| "epoch": 1.8843283582089554, | |
| "grad_norm": 0.1357594132423401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498918890953064, | |
| "mean_token_accuracy": 0.7760927677154541, | |
| "num_tokens": 8258496.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5294792056083679, | |
| "epoch": 1.8880597014925373, | |
| "grad_norm": 0.13375437259674072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297701358795166, | |
| "mean_token_accuracy": 0.7845475971698761, | |
| "num_tokens": 8274536.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5456722378730774, | |
| "epoch": 1.8917910447761193, | |
| "grad_norm": 0.14889481663703918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517223477363586, | |
| "mean_token_accuracy": 0.7756078243255615, | |
| "num_tokens": 8290986.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5214451998472214, | |
| "epoch": 1.8955223880597014, | |
| "grad_norm": 0.13305895030498505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5249897837638855, | |
| "mean_token_accuracy": 0.7870367765426636, | |
| "num_tokens": 8307117.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5336883068084717, | |
| "epoch": 1.8992537313432836, | |
| "grad_norm": 0.13193877041339874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352887511253357, | |
| "mean_token_accuracy": 0.7798391133546829, | |
| "num_tokens": 8323273.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5336564183235168, | |
| "epoch": 1.9029850746268657, | |
| "grad_norm": 0.12489310652017593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302382111549377, | |
| "mean_token_accuracy": 0.7845423817634583, | |
| "num_tokens": 8339385.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5382219552993774, | |
| "epoch": 1.9067164179104479, | |
| "grad_norm": 0.1456049680709839, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372790694236755, | |
| "mean_token_accuracy": 0.782544881105423, | |
| "num_tokens": 8355706.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5403454750776291, | |
| "epoch": 1.9104477611940298, | |
| "grad_norm": 0.12694604694843292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402185320854187, | |
| "mean_token_accuracy": 0.7826471477746964, | |
| "num_tokens": 8372132.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5318908393383026, | |
| "epoch": 1.914179104477612, | |
| "grad_norm": 0.1555122435092926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541782796382904, | |
| "mean_token_accuracy": 0.7774071842432022, | |
| "num_tokens": 8388306.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5221689939498901, | |
| "epoch": 1.917910447761194, | |
| "grad_norm": 0.1543516367673874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357338190078735, | |
| "mean_token_accuracy": 0.7826261073350906, | |
| "num_tokens": 8404876.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5229770094156265, | |
| "epoch": 1.921641791044776, | |
| "grad_norm": 0.13613452017307281, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244792699813843, | |
| "mean_token_accuracy": 0.7872123420238495, | |
| "num_tokens": 8421349.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5398612320423126, | |
| "epoch": 1.9253731343283582, | |
| "grad_norm": 0.14049243927001953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422282218933105, | |
| "mean_token_accuracy": 0.7783734500408173, | |
| "num_tokens": 8437774.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5401616841554642, | |
| "epoch": 1.9291044776119404, | |
| "grad_norm": 0.13164237141609192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331213474273682, | |
| "mean_token_accuracy": 0.7848468571901321, | |
| "num_tokens": 8454123.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5214215666055679, | |
| "epoch": 1.9328358208955225, | |
| "grad_norm": 0.13749226927757263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158907175064087, | |
| "mean_token_accuracy": 0.7904626429080963, | |
| "num_tokens": 8470320.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5412722826004028, | |
| "epoch": 1.9365671641791045, | |
| "grad_norm": 0.127340629696846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443693995475769, | |
| "mean_token_accuracy": 0.7785214781761169, | |
| "num_tokens": 8486754.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5276665389537811, | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 0.13310599327087402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311852693557739, | |
| "mean_token_accuracy": 0.7849074453115463, | |
| "num_tokens": 8503273.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5346188247203827, | |
| "epoch": 1.9440298507462686, | |
| "grad_norm": 0.12909531593322754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408310890197754, | |
| "mean_token_accuracy": 0.779103621840477, | |
| "num_tokens": 8519520.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5392955094575882, | |
| "epoch": 1.9477611940298507, | |
| "grad_norm": 0.12654371559619904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376543998718262, | |
| "mean_token_accuracy": 0.7810464203357697, | |
| "num_tokens": 8535688.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.526744157075882, | |
| "epoch": 1.9514925373134329, | |
| "grad_norm": 0.11877280473709106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258936882019043, | |
| "mean_token_accuracy": 0.7875306010246277, | |
| "num_tokens": 8551996.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5467166006565094, | |
| "epoch": 1.955223880597015, | |
| "grad_norm": 0.1407010555267334, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389098525047302, | |
| "mean_token_accuracy": 0.7805493026971817, | |
| "num_tokens": 8568202.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5553875267505646, | |
| "epoch": 1.9589552238805972, | |
| "grad_norm": 0.13490191102027893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481207370758057, | |
| "mean_token_accuracy": 0.7784747332334518, | |
| "num_tokens": 8584625.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5178312584757805, | |
| "epoch": 1.962686567164179, | |
| "grad_norm": 0.14236751198768616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5226012468338013, | |
| "mean_token_accuracy": 0.7866991758346558, | |
| "num_tokens": 8600683.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5227778926491737, | |
| "epoch": 1.966417910447761, | |
| "grad_norm": 0.16303445398807526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365378856658936, | |
| "mean_token_accuracy": 0.7807085812091827, | |
| "num_tokens": 8616685.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5410575568675995, | |
| "epoch": 1.9701492537313432, | |
| "grad_norm": 0.16557544469833374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510291457176208, | |
| "mean_token_accuracy": 0.7770103365182877, | |
| "num_tokens": 8633088.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.531767264008522, | |
| "epoch": 1.9738805970149254, | |
| "grad_norm": 0.16024784743785858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305666327476501, | |
| "mean_token_accuracy": 0.7834270149469376, | |
| "num_tokens": 8649322.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5423388332128525, | |
| "epoch": 1.9776119402985075, | |
| "grad_norm": 0.1314675360918045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316357016563416, | |
| "mean_token_accuracy": 0.7857660055160522, | |
| "num_tokens": 8665670.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5405716001987457, | |
| "epoch": 1.9813432835820897, | |
| "grad_norm": 0.1407650113105774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429906845092773, | |
| "mean_token_accuracy": 0.7817323058843613, | |
| "num_tokens": 8681998.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5365249365568161, | |
| "epoch": 1.9850746268656716, | |
| "grad_norm": 0.14180989563465118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345437526702881, | |
| "mean_token_accuracy": 0.7865561246871948, | |
| "num_tokens": 8698483.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5290075689554214, | |
| "epoch": 1.9888059701492538, | |
| "grad_norm": 0.1477176696062088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337146520614624, | |
| "mean_token_accuracy": 0.7824839055538177, | |
| "num_tokens": 8714640.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5333692282438278, | |
| "epoch": 1.9925373134328357, | |
| "grad_norm": 0.17112773656845093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424102544784546, | |
| "mean_token_accuracy": 0.779076337814331, | |
| "num_tokens": 8730887.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5415492355823517, | |
| "epoch": 1.9962686567164178, | |
| "grad_norm": 0.14943642914295197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476213693618774, | |
| "mean_token_accuracy": 0.7769679576158524, | |
| "num_tokens": 8747309.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5581045299768448, | |
| "epoch": 2.0, | |
| "grad_norm": 0.15832063555717468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548263788223267, | |
| "mean_token_accuracy": 0.776277557015419, | |
| "num_tokens": 8763550.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5369964390993118, | |
| "epoch": 2.003731343283582, | |
| "grad_norm": 0.15130668878555298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5179107189178467, | |
| "mean_token_accuracy": 0.7907675057649612, | |
| "num_tokens": 8779922.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5117110908031464, | |
| "epoch": 2.0074626865671643, | |
| "grad_norm": 0.16026535630226135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5020841956138611, | |
| "mean_token_accuracy": 0.7973873615264893, | |
| "num_tokens": 8795988.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5028296113014221, | |
| "epoch": 2.0111940298507465, | |
| "grad_norm": 0.1676231324672699, | |
| "learning_rate": 0.0002, | |
| "loss": 0.51214998960495, | |
| "mean_token_accuracy": 0.7921472936868668, | |
| "num_tokens": 8812261.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5081141889095306, | |
| "epoch": 2.014925373134328, | |
| "grad_norm": 0.21105162799358368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206259489059448, | |
| "mean_token_accuracy": 0.7869252115488052, | |
| "num_tokens": 8828964.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5053770169615746, | |
| "epoch": 2.0186567164179103, | |
| "grad_norm": 0.1996072232723236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146310329437256, | |
| "mean_token_accuracy": 0.7916830629110336, | |
| "num_tokens": 8845583.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5284380093216896, | |
| "epoch": 2.0223880597014925, | |
| "grad_norm": 0.14588730037212372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5199918150901794, | |
| "mean_token_accuracy": 0.7893239259719849, | |
| "num_tokens": 8861873.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5435770899057388, | |
| "epoch": 2.0261194029850746, | |
| "grad_norm": 0.14907799661159515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536811113357544, | |
| "mean_token_accuracy": 0.7802763283252716, | |
| "num_tokens": 8878456.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5174986571073532, | |
| "epoch": 2.029850746268657, | |
| "grad_norm": 0.14996512234210968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5144167542457581, | |
| "mean_token_accuracy": 0.7930785864591599, | |
| "num_tokens": 8894797.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5272421538829803, | |
| "epoch": 2.033582089552239, | |
| "grad_norm": 0.16765476763248444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306269526481628, | |
| "mean_token_accuracy": 0.7856330573558807, | |
| "num_tokens": 8911217.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.49972501397132874, | |
| "epoch": 2.0373134328358207, | |
| "grad_norm": 0.1322057694196701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5012874603271484, | |
| "mean_token_accuracy": 0.7979290634393692, | |
| "num_tokens": 8927511.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5031155720353127, | |
| "epoch": 2.041044776119403, | |
| "grad_norm": 0.16402538120746613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5100584626197815, | |
| "mean_token_accuracy": 0.7926298826932907, | |
| "num_tokens": 8943509.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5090021565556526, | |
| "epoch": 2.044776119402985, | |
| "grad_norm": 0.1516626924276352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.51352858543396, | |
| "mean_token_accuracy": 0.7925879657268524, | |
| "num_tokens": 8959744.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.4990556240081787, | |
| "epoch": 2.048507462686567, | |
| "grad_norm": 0.14189165830612183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5032692551612854, | |
| "mean_token_accuracy": 0.7943097651004791, | |
| "num_tokens": 8976001.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5276429355144501, | |
| "epoch": 2.0522388059701493, | |
| "grad_norm": 0.13545501232147217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5224078893661499, | |
| "mean_token_accuracy": 0.7892052680253983, | |
| "num_tokens": 8992265.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5246792286634445, | |
| "epoch": 2.0559701492537314, | |
| "grad_norm": 0.15987011790275574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220500230789185, | |
| "mean_token_accuracy": 0.7897221744060516, | |
| "num_tokens": 9008612.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5142855197191238, | |
| "epoch": 2.0597014925373136, | |
| "grad_norm": 0.17870153486728668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5103524923324585, | |
| "mean_token_accuracy": 0.7925411611795425, | |
| "num_tokens": 9025112.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5080101564526558, | |
| "epoch": 2.0634328358208953, | |
| "grad_norm": 0.19365249574184418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5135321617126465, | |
| "mean_token_accuracy": 0.792420819401741, | |
| "num_tokens": 9041825.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5249690413475037, | |
| "epoch": 2.0671641791044775, | |
| "grad_norm": 0.17408262193202972, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527820348739624, | |
| "mean_token_accuracy": 0.7850991487503052, | |
| "num_tokens": 9058218.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5355798751115799, | |
| "epoch": 2.0708955223880596, | |
| "grad_norm": 0.17400678992271423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327027440071106, | |
| "mean_token_accuracy": 0.7834015786647797, | |
| "num_tokens": 9074538.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5193932577967644, | |
| "epoch": 2.074626865671642, | |
| "grad_norm": 0.19260965287685394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203508138656616, | |
| "mean_token_accuracy": 0.7900512516498566, | |
| "num_tokens": 9090645.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5282454341650009, | |
| "epoch": 2.078358208955224, | |
| "grad_norm": 0.17010283470153809, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296856760978699, | |
| "mean_token_accuracy": 0.7844990193843842, | |
| "num_tokens": 9107205.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5335307121276855, | |
| "epoch": 2.082089552238806, | |
| "grad_norm": 0.18085786700248718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380091667175293, | |
| "mean_token_accuracy": 0.7830383628606796, | |
| "num_tokens": 9123633.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5050861239433289, | |
| "epoch": 2.0858208955223883, | |
| "grad_norm": 0.1828233301639557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5116996169090271, | |
| "mean_token_accuracy": 0.7909363359212875, | |
| "num_tokens": 9139672.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5233924090862274, | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.1721849888563156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234174728393555, | |
| "mean_token_accuracy": 0.7887046784162521, | |
| "num_tokens": 9156329.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5096859857439995, | |
| "epoch": 2.093283582089552, | |
| "grad_norm": 0.13895049691200256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5016306638717651, | |
| "mean_token_accuracy": 0.7958591133356094, | |
| "num_tokens": 9172549.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5022074803709984, | |
| "epoch": 2.0970149253731343, | |
| "grad_norm": 0.18107853829860687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49785315990448, | |
| "mean_token_accuracy": 0.7988625317811966, | |
| "num_tokens": 9188916.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.49919093400239944, | |
| "epoch": 2.1007462686567164, | |
| "grad_norm": 0.18361544609069824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5069372057914734, | |
| "mean_token_accuracy": 0.7953463643789291, | |
| "num_tokens": 9205116.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5179380178451538, | |
| "epoch": 2.1044776119402986, | |
| "grad_norm": 0.17814478278160095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5233405232429504, | |
| "mean_token_accuracy": 0.7879672199487686, | |
| "num_tokens": 9221422.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5209343507885933, | |
| "epoch": 2.1082089552238807, | |
| "grad_norm": 0.16368801891803741, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220014452934265, | |
| "mean_token_accuracy": 0.7900985032320023, | |
| "num_tokens": 9237878.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5203168541193008, | |
| "epoch": 2.111940298507463, | |
| "grad_norm": 0.18038009107112885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181905627250671, | |
| "mean_token_accuracy": 0.7902995347976685, | |
| "num_tokens": 9254207.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5203139036893845, | |
| "epoch": 2.1156716417910446, | |
| "grad_norm": 0.15972773730754852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5092154145240784, | |
| "mean_token_accuracy": 0.793173611164093, | |
| "num_tokens": 9270204.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5298740118741989, | |
| "epoch": 2.1194029850746268, | |
| "grad_norm": 0.16917745769023895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521593451499939, | |
| "mean_token_accuracy": 0.789896160364151, | |
| "num_tokens": 9286472.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5120234042406082, | |
| "epoch": 2.123134328358209, | |
| "grad_norm": 0.1817537248134613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5180550813674927, | |
| "mean_token_accuracy": 0.7886006981134415, | |
| "num_tokens": 9302801.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5053592845797539, | |
| "epoch": 2.126865671641791, | |
| "grad_norm": 0.17402999103069305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5133467316627502, | |
| "mean_token_accuracy": 0.7945185601711273, | |
| "num_tokens": 9318994.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5077695101499557, | |
| "epoch": 2.1305970149253732, | |
| "grad_norm": 0.1826324611902237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5111861228942871, | |
| "mean_token_accuracy": 0.7935459464788437, | |
| "num_tokens": 9335440.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5085733756422997, | |
| "epoch": 2.1343283582089554, | |
| "grad_norm": 0.20258648693561554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5162274837493896, | |
| "mean_token_accuracy": 0.7936873137950897, | |
| "num_tokens": 9351752.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5466553270816803, | |
| "epoch": 2.138059701492537, | |
| "grad_norm": 0.21011336147785187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393267273902893, | |
| "mean_token_accuracy": 0.7812587320804596, | |
| "num_tokens": 9368219.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5103291645646095, | |
| "epoch": 2.1417910447761193, | |
| "grad_norm": 0.16960836946964264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5084283351898193, | |
| "mean_token_accuracy": 0.7936739772558212, | |
| "num_tokens": 9384590.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5131630301475525, | |
| "epoch": 2.1455223880597014, | |
| "grad_norm": 0.17001323401927948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5123889446258545, | |
| "mean_token_accuracy": 0.7904325425624847, | |
| "num_tokens": 9400768.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5091337114572525, | |
| "epoch": 2.1492537313432836, | |
| "grad_norm": 0.19518889486789703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.512664794921875, | |
| "mean_token_accuracy": 0.7909765988588333, | |
| "num_tokens": 9416962.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.506959430873394, | |
| "epoch": 2.1529850746268657, | |
| "grad_norm": 0.19361013174057007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5145208835601807, | |
| "mean_token_accuracy": 0.7909970581531525, | |
| "num_tokens": 9433273.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5075285658240318, | |
| "epoch": 2.156716417910448, | |
| "grad_norm": 0.20014171302318573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5108210444450378, | |
| "mean_token_accuracy": 0.795252114534378, | |
| "num_tokens": 9449764.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5293942838907242, | |
| "epoch": 2.16044776119403, | |
| "grad_norm": 0.1974441111087799, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285412669181824, | |
| "mean_token_accuracy": 0.7868294268846512, | |
| "num_tokens": 9466170.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5336958318948746, | |
| "epoch": 2.1641791044776117, | |
| "grad_norm": 0.16498853266239166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246227383613586, | |
| "mean_token_accuracy": 0.7904203087091446, | |
| "num_tokens": 9482671.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5340626388788223, | |
| "epoch": 2.167910447761194, | |
| "grad_norm": 0.16569171845912933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292053818702698, | |
| "mean_token_accuracy": 0.7861965000629425, | |
| "num_tokens": 9499134.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5213732421398163, | |
| "epoch": 2.171641791044776, | |
| "grad_norm": 0.191435769200325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527378499507904, | |
| "mean_token_accuracy": 0.7864173054695129, | |
| "num_tokens": 9515505.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5035439431667328, | |
| "epoch": 2.175373134328358, | |
| "grad_norm": 0.1665230244398117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5038704872131348, | |
| "mean_token_accuracy": 0.7968962043523788, | |
| "num_tokens": 9532118.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5060234367847443, | |
| "epoch": 2.1791044776119404, | |
| "grad_norm": 0.16969595849514008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5113446712493896, | |
| "mean_token_accuracy": 0.7920107841491699, | |
| "num_tokens": 9548351.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5291168391704559, | |
| "epoch": 2.1828358208955225, | |
| "grad_norm": 0.16809239983558655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360448360443115, | |
| "mean_token_accuracy": 0.7811578214168549, | |
| "num_tokens": 9564913.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5199222788214684, | |
| "epoch": 2.1865671641791047, | |
| "grad_norm": 0.15394440293312073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5177597403526306, | |
| "mean_token_accuracy": 0.7905119061470032, | |
| "num_tokens": 9581583.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5282980501651764, | |
| "epoch": 2.1902985074626864, | |
| "grad_norm": 0.17473557591438293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527908980846405, | |
| "mean_token_accuracy": 0.7872945964336395, | |
| "num_tokens": 9598262.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5268830358982086, | |
| "epoch": 2.1940298507462686, | |
| "grad_norm": 0.16386888921260834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5233091711997986, | |
| "mean_token_accuracy": 0.788049191236496, | |
| "num_tokens": 9614535.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5275766104459763, | |
| "epoch": 2.1977611940298507, | |
| "grad_norm": 0.17853675782680511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314985513687134, | |
| "mean_token_accuracy": 0.7853439450263977, | |
| "num_tokens": 9630730.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5230407416820526, | |
| "epoch": 2.201492537313433, | |
| "grad_norm": 0.18614573776721954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324023365974426, | |
| "mean_token_accuracy": 0.7870204299688339, | |
| "num_tokens": 9647367.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5045590102672577, | |
| "epoch": 2.205223880597015, | |
| "grad_norm": 0.16460436582565308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5095564723014832, | |
| "mean_token_accuracy": 0.7933550179004669, | |
| "num_tokens": 9663807.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5061227604746819, | |
| "epoch": 2.208955223880597, | |
| "grad_norm": 0.1727134734392166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.50539630651474, | |
| "mean_token_accuracy": 0.79543037712574, | |
| "num_tokens": 9679957.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5444381237030029, | |
| "epoch": 2.2126865671641793, | |
| "grad_norm": 0.1631772667169571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421435832977295, | |
| "mean_token_accuracy": 0.7804461270570755, | |
| "num_tokens": 9696269.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5140876695513725, | |
| "epoch": 2.216417910447761, | |
| "grad_norm": 0.14234963059425354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5083339214324951, | |
| "mean_token_accuracy": 0.7940346747636795, | |
| "num_tokens": 9712614.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5227879285812378, | |
| "epoch": 2.220149253731343, | |
| "grad_norm": 0.1700550764799118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256499648094177, | |
| "mean_token_accuracy": 0.788642093539238, | |
| "num_tokens": 9729090.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5193727314472198, | |
| "epoch": 2.2238805970149254, | |
| "grad_norm": 0.16189917922019958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.515200674533844, | |
| "mean_token_accuracy": 0.7933167964220047, | |
| "num_tokens": 9745602.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5037901103496552, | |
| "epoch": 2.2276119402985075, | |
| "grad_norm": 0.15295493602752686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5038392543792725, | |
| "mean_token_accuracy": 0.7972543388605118, | |
| "num_tokens": 9761880.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5051177442073822, | |
| "epoch": 2.2313432835820897, | |
| "grad_norm": 0.18619783222675323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5126343369483948, | |
| "mean_token_accuracy": 0.794564738869667, | |
| "num_tokens": 9778073.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5051270872354507, | |
| "epoch": 2.235074626865672, | |
| "grad_norm": 0.1611267328262329, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5092532634735107, | |
| "mean_token_accuracy": 0.7946549952030182, | |
| "num_tokens": 9794345.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5325346812605858, | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 0.20552673935890198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378585457801819, | |
| "mean_token_accuracy": 0.7835244834423065, | |
| "num_tokens": 9810716.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5362858921289444, | |
| "epoch": 2.2425373134328357, | |
| "grad_norm": 0.1832580715417862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247851014137268, | |
| "mean_token_accuracy": 0.7862047404050827, | |
| "num_tokens": 9826899.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.515026330947876, | |
| "epoch": 2.246268656716418, | |
| "grad_norm": 0.1738833785057068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5104220509529114, | |
| "mean_token_accuracy": 0.7956585586071014, | |
| "num_tokens": 9843201.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5326243042945862, | |
| "epoch": 2.25, | |
| "grad_norm": 0.19789133965969086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377206206321716, | |
| "mean_token_accuracy": 0.7844580560922623, | |
| "num_tokens": 9859428.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5045425221323967, | |
| "epoch": 2.253731343283582, | |
| "grad_norm": 0.22017110884189606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5142727494239807, | |
| "mean_token_accuracy": 0.7916774153709412, | |
| "num_tokens": 9875509.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5083225071430206, | |
| "epoch": 2.2574626865671643, | |
| "grad_norm": 0.20720691978931427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5168294906616211, | |
| "mean_token_accuracy": 0.7916733622550964, | |
| "num_tokens": 9891513.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5038861483335495, | |
| "epoch": 2.2611940298507465, | |
| "grad_norm": 0.22461913526058197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155696868896484, | |
| "mean_token_accuracy": 0.7936981916427612, | |
| "num_tokens": 9907970.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.544201672077179, | |
| "epoch": 2.264925373134328, | |
| "grad_norm": 0.22078122198581696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377649664878845, | |
| "mean_token_accuracy": 0.7846001982688904, | |
| "num_tokens": 9924358.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5319496989250183, | |
| "epoch": 2.2686567164179103, | |
| "grad_norm": 0.15865834057331085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269988775253296, | |
| "mean_token_accuracy": 0.7889304012060165, | |
| "num_tokens": 9940613.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5121538639068604, | |
| "epoch": 2.2723880597014925, | |
| "grad_norm": 0.19707661867141724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5115834474563599, | |
| "mean_token_accuracy": 0.7899812310934067, | |
| "num_tokens": 9956900.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5339771807193756, | |
| "epoch": 2.2761194029850746, | |
| "grad_norm": 0.15257956087589264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300955772399902, | |
| "mean_token_accuracy": 0.785103976726532, | |
| "num_tokens": 9973499.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5281384140253067, | |
| "epoch": 2.279850746268657, | |
| "grad_norm": 0.16553470492362976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257382392883301, | |
| "mean_token_accuracy": 0.7875041514635086, | |
| "num_tokens": 9989801.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5170317441225052, | |
| "epoch": 2.283582089552239, | |
| "grad_norm": 0.1715046465396881, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181665420532227, | |
| "mean_token_accuracy": 0.7884780019521713, | |
| "num_tokens": 10006078.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5153259709477425, | |
| "epoch": 2.2873134328358207, | |
| "grad_norm": 0.1548839956521988, | |
| "learning_rate": 0.0002, | |
| "loss": 0.514171302318573, | |
| "mean_token_accuracy": 0.7930748611688614, | |
| "num_tokens": 10022246.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5224331915378571, | |
| "epoch": 2.291044776119403, | |
| "grad_norm": 0.1681355983018875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221542119979858, | |
| "mean_token_accuracy": 0.7877352833747864, | |
| "num_tokens": 10038788.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5205291956663132, | |
| "epoch": 2.294776119402985, | |
| "grad_norm": 0.16179999709129333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216364860534668, | |
| "mean_token_accuracy": 0.7894330769777298, | |
| "num_tokens": 10055226.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5362520515918732, | |
| "epoch": 2.298507462686567, | |
| "grad_norm": 0.19491799175739288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382164716720581, | |
| "mean_token_accuracy": 0.7841734141111374, | |
| "num_tokens": 10071636.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5122754499316216, | |
| "epoch": 2.3022388059701493, | |
| "grad_norm": 0.15888278186321259, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5128467082977295, | |
| "mean_token_accuracy": 0.7957093715667725, | |
| "num_tokens": 10087915.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.530030369758606, | |
| "epoch": 2.3059701492537314, | |
| "grad_norm": 0.20173799991607666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327577590942383, | |
| "mean_token_accuracy": 0.7822887450456619, | |
| "num_tokens": 10104328.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.511964850127697, | |
| "epoch": 2.3097014925373136, | |
| "grad_norm": 0.22716699540615082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194392800331116, | |
| "mean_token_accuracy": 0.7923955619335175, | |
| "num_tokens": 10120902.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5184068530797958, | |
| "epoch": 2.3134328358208958, | |
| "grad_norm": 0.1653965413570404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5168477892875671, | |
| "mean_token_accuracy": 0.7927787899971008, | |
| "num_tokens": 10137330.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5173092186450958, | |
| "epoch": 2.3171641791044775, | |
| "grad_norm": 0.1853804737329483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189480781555176, | |
| "mean_token_accuracy": 0.7897288352251053, | |
| "num_tokens": 10153802.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5215531587600708, | |
| "epoch": 2.3208955223880596, | |
| "grad_norm": 0.1907532960176468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235369801521301, | |
| "mean_token_accuracy": 0.7906839698553085, | |
| "num_tokens": 10170052.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5299772173166275, | |
| "epoch": 2.324626865671642, | |
| "grad_norm": 0.17518973350524902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5251893401145935, | |
| "mean_token_accuracy": 0.7905509769916534, | |
| "num_tokens": 10186299.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5111118629574776, | |
| "epoch": 2.328358208955224, | |
| "grad_norm": 0.162562295794487, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5044469237327576, | |
| "mean_token_accuracy": 0.793881356716156, | |
| "num_tokens": 10202479.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5176884084939957, | |
| "epoch": 2.332089552238806, | |
| "grad_norm": 0.15817266702651978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189487934112549, | |
| "mean_token_accuracy": 0.7899019569158554, | |
| "num_tokens": 10218755.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5375020056962967, | |
| "epoch": 2.3358208955223883, | |
| "grad_norm": 0.16503086686134338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378777980804443, | |
| "mean_token_accuracy": 0.7797044813632965, | |
| "num_tokens": 10235308.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5069606155157089, | |
| "epoch": 2.33955223880597, | |
| "grad_norm": 0.19356752932071686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5149304866790771, | |
| "mean_token_accuracy": 0.790899932384491, | |
| "num_tokens": 10251410.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5025136545300484, | |
| "epoch": 2.343283582089552, | |
| "grad_norm": 0.1775875836610794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5070807933807373, | |
| "mean_token_accuracy": 0.7955823987722397, | |
| "num_tokens": 10267499.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5052608847618103, | |
| "epoch": 2.3470149253731343, | |
| "grad_norm": 0.21965590119361877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5101135969161987, | |
| "mean_token_accuracy": 0.7949910014867783, | |
| "num_tokens": 10283791.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5179193317890167, | |
| "epoch": 2.3507462686567164, | |
| "grad_norm": 0.19963982701301575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5215207934379578, | |
| "mean_token_accuracy": 0.7893756926059723, | |
| "num_tokens": 10299845.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5158931389451027, | |
| "epoch": 2.3544776119402986, | |
| "grad_norm": 0.160457581281662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5119190216064453, | |
| "mean_token_accuracy": 0.7945539355278015, | |
| "num_tokens": 10316272.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5080019608139992, | |
| "epoch": 2.3582089552238807, | |
| "grad_norm": 0.1729355752468109, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5050552487373352, | |
| "mean_token_accuracy": 0.7989319264888763, | |
| "num_tokens": 10332919.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5174911320209503, | |
| "epoch": 2.361940298507463, | |
| "grad_norm": 0.1741209179162979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234130024909973, | |
| "mean_token_accuracy": 0.7888159304857254, | |
| "num_tokens": 10349259.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5265702903270721, | |
| "epoch": 2.3656716417910446, | |
| "grad_norm": 0.19182217121124268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293515920639038, | |
| "mean_token_accuracy": 0.7829533070325851, | |
| "num_tokens": 10365491.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5425137877464294, | |
| "epoch": 2.3694029850746268, | |
| "grad_norm": 0.16463470458984375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542192280292511, | |
| "mean_token_accuracy": 0.7816719859838486, | |
| "num_tokens": 10381847.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5144196897745132, | |
| "epoch": 2.373134328358209, | |
| "grad_norm": 0.16132977604866028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5131939053535461, | |
| "mean_token_accuracy": 0.7919805645942688, | |
| "num_tokens": 10398171.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5415032058954239, | |
| "epoch": 2.376865671641791, | |
| "grad_norm": 0.16324372589588165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371772050857544, | |
| "mean_token_accuracy": 0.7831342816352844, | |
| "num_tokens": 10414686.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5282690078020096, | |
| "epoch": 2.3805970149253732, | |
| "grad_norm": 0.17967335879802704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203690528869629, | |
| "mean_token_accuracy": 0.7885807305574417, | |
| "num_tokens": 10431126.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5216360539197922, | |
| "epoch": 2.3843283582089554, | |
| "grad_norm": 0.16235722601413727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236966013908386, | |
| "mean_token_accuracy": 0.7884224951267242, | |
| "num_tokens": 10447324.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5296328365802765, | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.1916787028312683, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376251339912415, | |
| "mean_token_accuracy": 0.7802027314901352, | |
| "num_tokens": 10463603.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5012985095381737, | |
| "epoch": 2.3917910447761193, | |
| "grad_norm": 0.19376890361309052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5101221203804016, | |
| "mean_token_accuracy": 0.7951995581388474, | |
| "num_tokens": 10479993.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5038901194930077, | |
| "epoch": 2.3955223880597014, | |
| "grad_norm": 0.17371249198913574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146278738975525, | |
| "mean_token_accuracy": 0.7905002534389496, | |
| "num_tokens": 10496023.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5509473532438278, | |
| "epoch": 2.3992537313432836, | |
| "grad_norm": 0.15395016968250275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546664834022522, | |
| "mean_token_accuracy": 0.7777733653783798, | |
| "num_tokens": 10512527.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5174002125859261, | |
| "epoch": 2.4029850746268657, | |
| "grad_norm": 0.1537095606327057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5125638842582703, | |
| "mean_token_accuracy": 0.7953683733940125, | |
| "num_tokens": 10529050.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5259301066398621, | |
| "epoch": 2.406716417910448, | |
| "grad_norm": 0.19275200366973877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534030556678772, | |
| "mean_token_accuracy": 0.7856698781251907, | |
| "num_tokens": 10545403.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5141283497214317, | |
| "epoch": 2.41044776119403, | |
| "grad_norm": 0.2044205218553543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5202509760856628, | |
| "mean_token_accuracy": 0.7915003001689911, | |
| "num_tokens": 10561404.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5140255615115166, | |
| "epoch": 2.4141791044776117, | |
| "grad_norm": 0.17939844727516174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5115104913711548, | |
| "mean_token_accuracy": 0.7907571196556091, | |
| "num_tokens": 10577588.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5283705443143845, | |
| "epoch": 2.417910447761194, | |
| "grad_norm": 0.19888189435005188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5198178291320801, | |
| "mean_token_accuracy": 0.7891141772270203, | |
| "num_tokens": 10593859.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5462386906147003, | |
| "epoch": 2.421641791044776, | |
| "grad_norm": 0.1922907531261444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396484732627869, | |
| "mean_token_accuracy": 0.7813579887151718, | |
| "num_tokens": 10610303.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5058758109807968, | |
| "epoch": 2.425373134328358, | |
| "grad_norm": 0.21254123747348785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5134891271591187, | |
| "mean_token_accuracy": 0.7951326668262482, | |
| "num_tokens": 10626628.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5051485821604729, | |
| "epoch": 2.4291044776119404, | |
| "grad_norm": 0.17681139707565308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5095136761665344, | |
| "mean_token_accuracy": 0.7927682101726532, | |
| "num_tokens": 10642872.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5098261535167694, | |
| "epoch": 2.4328358208955225, | |
| "grad_norm": 0.1644936203956604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5163934230804443, | |
| "mean_token_accuracy": 0.7900458127260208, | |
| "num_tokens": 10659143.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5026194378733635, | |
| "epoch": 2.4365671641791042, | |
| "grad_norm": 0.1890725940465927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.511451244354248, | |
| "mean_token_accuracy": 0.7927152365446091, | |
| "num_tokens": 10675503.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5148562490940094, | |
| "epoch": 2.4402985074626864, | |
| "grad_norm": 0.1650211215019226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5156391263008118, | |
| "mean_token_accuracy": 0.7906764894723892, | |
| "num_tokens": 10691795.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5057827532291412, | |
| "epoch": 2.4440298507462686, | |
| "grad_norm": 0.1589452177286148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5033491849899292, | |
| "mean_token_accuracy": 0.7994053959846497, | |
| "num_tokens": 10707762.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5219250470399857, | |
| "epoch": 2.4477611940298507, | |
| "grad_norm": 0.18478544056415558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219628810882568, | |
| "mean_token_accuracy": 0.7873866856098175, | |
| "num_tokens": 10724063.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5177232921123505, | |
| "epoch": 2.451492537313433, | |
| "grad_norm": 0.17303429543972015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200316309928894, | |
| "mean_token_accuracy": 0.7885988503694534, | |
| "num_tokens": 10740399.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5319043695926666, | |
| "epoch": 2.455223880597015, | |
| "grad_norm": 0.18429186940193176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326516032218933, | |
| "mean_token_accuracy": 0.7862447798252106, | |
| "num_tokens": 10756986.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5453691333532333, | |
| "epoch": 2.458955223880597, | |
| "grad_norm": 0.16711914539337158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386096239089966, | |
| "mean_token_accuracy": 0.7812793850898743, | |
| "num_tokens": 10773458.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5214618891477585, | |
| "epoch": 2.4626865671641793, | |
| "grad_norm": 0.1909995675086975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.518884003162384, | |
| "mean_token_accuracy": 0.7878068089485168, | |
| "num_tokens": 10789818.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.523200586438179, | |
| "epoch": 2.466417910447761, | |
| "grad_norm": 0.17626361548900604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5212401151657104, | |
| "mean_token_accuracy": 0.7900760471820831, | |
| "num_tokens": 10806143.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5310025811195374, | |
| "epoch": 2.470149253731343, | |
| "grad_norm": 0.24172359704971313, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338881611824036, | |
| "mean_token_accuracy": 0.7858817130327225, | |
| "num_tokens": 10822437.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5151319652795792, | |
| "epoch": 2.4738805970149254, | |
| "grad_norm": 0.19658994674682617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5139521956443787, | |
| "mean_token_accuracy": 0.7917647659778595, | |
| "num_tokens": 10838442.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5117574036121368, | |
| "epoch": 2.4776119402985075, | |
| "grad_norm": 0.2189301699399948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.513599693775177, | |
| "mean_token_accuracy": 0.7897299826145172, | |
| "num_tokens": 10854797.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5397205054759979, | |
| "epoch": 2.4813432835820897, | |
| "grad_norm": 0.2076101452112198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459029078483582, | |
| "mean_token_accuracy": 0.7777052521705627, | |
| "num_tokens": 10871117.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.525243952870369, | |
| "epoch": 2.485074626865672, | |
| "grad_norm": 0.1969526708126068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259374380111694, | |
| "mean_token_accuracy": 0.7870301008224487, | |
| "num_tokens": 10887285.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.521914929151535, | |
| "epoch": 2.4888059701492535, | |
| "grad_norm": 0.1793866604566574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523249626159668, | |
| "mean_token_accuracy": 0.7908923327922821, | |
| "num_tokens": 10903583.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5157094374299049, | |
| "epoch": 2.4925373134328357, | |
| "grad_norm": 0.1676340252161026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196658372879028, | |
| "mean_token_accuracy": 0.7936161011457443, | |
| "num_tokens": 10919876.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.49876970052719116, | |
| "epoch": 2.496268656716418, | |
| "grad_norm": 0.18448136746883392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49738743901252747, | |
| "mean_token_accuracy": 0.8003499060869217, | |
| "num_tokens": 10936091.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5243137031793594, | |
| "epoch": 2.5, | |
| "grad_norm": 0.1985243260860443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526336133480072, | |
| "mean_token_accuracy": 0.7861499488353729, | |
| "num_tokens": 10952522.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5277926176786423, | |
| "epoch": 2.503731343283582, | |
| "grad_norm": 0.15664395689964294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211771726608276, | |
| "mean_token_accuracy": 0.7905664294958115, | |
| "num_tokens": 10968886.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5109870582818985, | |
| "epoch": 2.5074626865671643, | |
| "grad_norm": 0.17840486764907837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5104790925979614, | |
| "mean_token_accuracy": 0.7953955680131912, | |
| "num_tokens": 10985258.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.4981943815946579, | |
| "epoch": 2.5111940298507465, | |
| "grad_norm": 0.15788039565086365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5019396543502808, | |
| "mean_token_accuracy": 0.7957722395658493, | |
| "num_tokens": 11001537.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.4992476552724838, | |
| "epoch": 2.5149253731343286, | |
| "grad_norm": 0.20122262835502625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5123214721679688, | |
| "mean_token_accuracy": 0.7936280071735382, | |
| "num_tokens": 11017858.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5326351076364517, | |
| "epoch": 2.5186567164179103, | |
| "grad_norm": 0.15370923280715942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299698114395142, | |
| "mean_token_accuracy": 0.7864175289869308, | |
| "num_tokens": 11034251.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5276974588632584, | |
| "epoch": 2.5223880597014925, | |
| "grad_norm": 0.16408182680606842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256198644638062, | |
| "mean_token_accuracy": 0.7864832729101181, | |
| "num_tokens": 11050538.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5174605995416641, | |
| "epoch": 2.5261194029850746, | |
| "grad_norm": 0.1726282238960266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5166889429092407, | |
| "mean_token_accuracy": 0.7903372198343277, | |
| "num_tokens": 11066909.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5096773952245712, | |
| "epoch": 2.529850746268657, | |
| "grad_norm": 0.18736550211906433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5147178173065186, | |
| "mean_token_accuracy": 0.7915707528591156, | |
| "num_tokens": 11083296.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5143576934933662, | |
| "epoch": 2.533582089552239, | |
| "grad_norm": 0.18496522307395935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5202215909957886, | |
| "mean_token_accuracy": 0.7876331657171249, | |
| "num_tokens": 11099735.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5062269270420074, | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 0.18014365434646606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5091406106948853, | |
| "mean_token_accuracy": 0.7964621633291245, | |
| "num_tokens": 11116208.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5146580412983894, | |
| "epoch": 2.541044776119403, | |
| "grad_norm": 0.15533168613910675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158394575119019, | |
| "mean_token_accuracy": 0.7913824915885925, | |
| "num_tokens": 11132744.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5299884453415871, | |
| "epoch": 2.544776119402985, | |
| "grad_norm": 0.19397816061973572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282403826713562, | |
| "mean_token_accuracy": 0.7865999937057495, | |
| "num_tokens": 11149385.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5197403728961945, | |
| "epoch": 2.548507462686567, | |
| "grad_norm": 0.1893748939037323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172282457351685, | |
| "mean_token_accuracy": 0.7889421880245209, | |
| "num_tokens": 11165536.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5483877509832382, | |
| "epoch": 2.5522388059701493, | |
| "grad_norm": 0.1692439764738083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408689975738525, | |
| "mean_token_accuracy": 0.7819931209087372, | |
| "num_tokens": 11182199.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5187435150146484, | |
| "epoch": 2.5559701492537314, | |
| "grad_norm": 0.16838251054286957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220701098442078, | |
| "mean_token_accuracy": 0.7913226187229156, | |
| "num_tokens": 11198351.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5129819363355637, | |
| "epoch": 2.5597014925373136, | |
| "grad_norm": 0.18473690748214722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5199850797653198, | |
| "mean_token_accuracy": 0.7907718271017075, | |
| "num_tokens": 11214899.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5174092352390289, | |
| "epoch": 2.5634328358208958, | |
| "grad_norm": 0.18355096876621246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231988430023193, | |
| "mean_token_accuracy": 0.7854581624269485, | |
| "num_tokens": 11231316.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5146564170718193, | |
| "epoch": 2.5671641791044775, | |
| "grad_norm": 0.20094642043113708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5167846083641052, | |
| "mean_token_accuracy": 0.7892555296421051, | |
| "num_tokens": 11247525.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5073134675621986, | |
| "epoch": 2.5708955223880596, | |
| "grad_norm": 0.17776694893836975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5059224963188171, | |
| "mean_token_accuracy": 0.7938186377286911, | |
| "num_tokens": 11263630.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.51164161413908, | |
| "epoch": 2.574626865671642, | |
| "grad_norm": 0.23441171646118164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5132524371147156, | |
| "mean_token_accuracy": 0.7924985736608505, | |
| "num_tokens": 11279891.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5324152410030365, | |
| "epoch": 2.578358208955224, | |
| "grad_norm": 0.1964472234249115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321142673492432, | |
| "mean_token_accuracy": 0.7884731590747833, | |
| "num_tokens": 11296194.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5136373415589333, | |
| "epoch": 2.582089552238806, | |
| "grad_norm": 0.23449179530143738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196998715400696, | |
| "mean_token_accuracy": 0.7908406853675842, | |
| "num_tokens": 11312615.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5276090502738953, | |
| "epoch": 2.585820895522388, | |
| "grad_norm": 0.16686299443244934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247229337692261, | |
| "mean_token_accuracy": 0.7879517525434494, | |
| "num_tokens": 11329158.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5419809222221375, | |
| "epoch": 2.58955223880597, | |
| "grad_norm": 0.19849538803100586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328899621963501, | |
| "mean_token_accuracy": 0.7848672121763229, | |
| "num_tokens": 11345724.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5273312255740166, | |
| "epoch": 2.593283582089552, | |
| "grad_norm": 0.15091370046138763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279825925827026, | |
| "mean_token_accuracy": 0.7853807210922241, | |
| "num_tokens": 11362189.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5198656767606735, | |
| "epoch": 2.5970149253731343, | |
| "grad_norm": 0.23191620409488678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321477651596069, | |
| "mean_token_accuracy": 0.7849823385477066, | |
| "num_tokens": 11378807.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5051373466849327, | |
| "epoch": 2.6007462686567164, | |
| "grad_norm": 0.16530166566371918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5118955373764038, | |
| "mean_token_accuracy": 0.7921792417764664, | |
| "num_tokens": 11395066.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5375550240278244, | |
| "epoch": 2.6044776119402986, | |
| "grad_norm": 0.16651837527751923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333649516105652, | |
| "mean_token_accuracy": 0.7834018468856812, | |
| "num_tokens": 11411502.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.509097121655941, | |
| "epoch": 2.6082089552238807, | |
| "grad_norm": 0.19326747953891754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5079880952835083, | |
| "mean_token_accuracy": 0.7902690321207047, | |
| "num_tokens": 11427527.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5243344008922577, | |
| "epoch": 2.611940298507463, | |
| "grad_norm": 0.17708131670951843, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527232825756073, | |
| "mean_token_accuracy": 0.78766830265522, | |
| "num_tokens": 11443934.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5099955424666405, | |
| "epoch": 2.6156716417910446, | |
| "grad_norm": 0.22393395006656647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181647539138794, | |
| "mean_token_accuracy": 0.7911688387393951, | |
| "num_tokens": 11460041.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5081977397203445, | |
| "epoch": 2.6194029850746268, | |
| "grad_norm": 0.19041450321674347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169417262077332, | |
| "mean_token_accuracy": 0.7914475202560425, | |
| "num_tokens": 11476118.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.531707689166069, | |
| "epoch": 2.623134328358209, | |
| "grad_norm": 0.1838483214378357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5199188590049744, | |
| "mean_token_accuracy": 0.7899897545576096, | |
| "num_tokens": 11492660.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5364825427532196, | |
| "epoch": 2.626865671641791, | |
| "grad_norm": 0.1751444786787033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356893539428711, | |
| "mean_token_accuracy": 0.7835856378078461, | |
| "num_tokens": 11509081.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5187056511640549, | |
| "epoch": 2.6305970149253732, | |
| "grad_norm": 0.17921118438243866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232405066490173, | |
| "mean_token_accuracy": 0.7884531170129776, | |
| "num_tokens": 11525499.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5242651104927063, | |
| "epoch": 2.6343283582089554, | |
| "grad_norm": 0.18693575263023376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285453796386719, | |
| "mean_token_accuracy": 0.786514088511467, | |
| "num_tokens": 11541734.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.516477108001709, | |
| "epoch": 2.638059701492537, | |
| "grad_norm": 0.1994662582874298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5184328556060791, | |
| "mean_token_accuracy": 0.79111048579216, | |
| "num_tokens": 11558204.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5288708806037903, | |
| "epoch": 2.6417910447761193, | |
| "grad_norm": 0.16373923420906067, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5213331580162048, | |
| "mean_token_accuracy": 0.7881525307893753, | |
| "num_tokens": 11574434.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5072719901800156, | |
| "epoch": 2.6455223880597014, | |
| "grad_norm": 0.1917801946401596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.509112536907196, | |
| "mean_token_accuracy": 0.7960505336523056, | |
| "num_tokens": 11590646.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5356978923082352, | |
| "epoch": 2.6492537313432836, | |
| "grad_norm": 0.19294337928295135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388337969779968, | |
| "mean_token_accuracy": 0.7824567407369614, | |
| "num_tokens": 11606979.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5163687542080879, | |
| "epoch": 2.6529850746268657, | |
| "grad_norm": 0.1852083057165146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158357620239258, | |
| "mean_token_accuracy": 0.7907344847917557, | |
| "num_tokens": 11623404.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5283653736114502, | |
| "epoch": 2.656716417910448, | |
| "grad_norm": 0.17565470933914185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322569608688354, | |
| "mean_token_accuracy": 0.7860839515924454, | |
| "num_tokens": 11639756.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5301189422607422, | |
| "epoch": 2.66044776119403, | |
| "grad_norm": 0.18470223248004913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344855785369873, | |
| "mean_token_accuracy": 0.7831524461507797, | |
| "num_tokens": 11656115.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5131835639476776, | |
| "epoch": 2.664179104477612, | |
| "grad_norm": 0.14412830770015717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5086023211479187, | |
| "mean_token_accuracy": 0.7938779592514038, | |
| "num_tokens": 11672197.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5248347520828247, | |
| "epoch": 2.667910447761194, | |
| "grad_norm": 0.1623944342136383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236642360687256, | |
| "mean_token_accuracy": 0.78847536444664, | |
| "num_tokens": 11688778.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5317736268043518, | |
| "epoch": 2.671641791044776, | |
| "grad_norm": 0.17043523490428925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294151306152344, | |
| "mean_token_accuracy": 0.7867350727319717, | |
| "num_tokens": 11704972.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5292799472808838, | |
| "epoch": 2.675373134328358, | |
| "grad_norm": 0.21420958638191223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348944664001465, | |
| "mean_token_accuracy": 0.784217044711113, | |
| "num_tokens": 11721357.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.513471245765686, | |
| "epoch": 2.6791044776119404, | |
| "grad_norm": 0.18216556310653687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5178148746490479, | |
| "mean_token_accuracy": 0.7881872206926346, | |
| "num_tokens": 11737640.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5091867446899414, | |
| "epoch": 2.6828358208955225, | |
| "grad_norm": 0.18353325128555298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.509505033493042, | |
| "mean_token_accuracy": 0.7933301627635956, | |
| "num_tokens": 11753743.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.4985937625169754, | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.17763254046440125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5041629076004028, | |
| "mean_token_accuracy": 0.7961723208427429, | |
| "num_tokens": 11769941.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5326617211103439, | |
| "epoch": 2.6902985074626864, | |
| "grad_norm": 0.17128810286521912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273231863975525, | |
| "mean_token_accuracy": 0.7882279455661774, | |
| "num_tokens": 11786468.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5309469103813171, | |
| "epoch": 2.6940298507462686, | |
| "grad_norm": 0.16436029970645905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328190326690674, | |
| "mean_token_accuracy": 0.7852970659732819, | |
| "num_tokens": 11802907.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5232216566801071, | |
| "epoch": 2.6977611940298507, | |
| "grad_norm": 0.16719315946102142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230921506881714, | |
| "mean_token_accuracy": 0.7876270413398743, | |
| "num_tokens": 11819317.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5203052535653114, | |
| "epoch": 2.701492537313433, | |
| "grad_norm": 0.19284284114837646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5245278477668762, | |
| "mean_token_accuracy": 0.7879077643156052, | |
| "num_tokens": 11835688.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5309562981128693, | |
| "epoch": 2.705223880597015, | |
| "grad_norm": 0.237013041973114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299087166786194, | |
| "mean_token_accuracy": 0.7888383269309998, | |
| "num_tokens": 11851919.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5239868611097336, | |
| "epoch": 2.708955223880597, | |
| "grad_norm": 0.1684781163930893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5212418437004089, | |
| "mean_token_accuracy": 0.7896943688392639, | |
| "num_tokens": 11868352.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5078758075833321, | |
| "epoch": 2.7126865671641793, | |
| "grad_norm": 0.18132759630680084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5123098492622375, | |
| "mean_token_accuracy": 0.7928104400634766, | |
| "num_tokens": 11884504.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5257874876260757, | |
| "epoch": 2.716417910447761, | |
| "grad_norm": 0.18958209455013275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350735783576965, | |
| "mean_token_accuracy": 0.7816809117794037, | |
| "num_tokens": 11900762.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5237897783517838, | |
| "epoch": 2.720149253731343, | |
| "grad_norm": 0.17628394067287445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271024107933044, | |
| "mean_token_accuracy": 0.7875955998897552, | |
| "num_tokens": 11917096.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5278095304965973, | |
| "epoch": 2.7238805970149254, | |
| "grad_norm": 0.1737760603427887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236294865608215, | |
| "mean_token_accuracy": 0.7871440947055817, | |
| "num_tokens": 11933442.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5360710769891739, | |
| "epoch": 2.7276119402985075, | |
| "grad_norm": 0.17106162011623383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306381583213806, | |
| "mean_token_accuracy": 0.7830738425254822, | |
| "num_tokens": 11949977.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5101736485958099, | |
| "epoch": 2.7313432835820897, | |
| "grad_norm": 0.17468304932117462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146869421005249, | |
| "mean_token_accuracy": 0.7935636639595032, | |
| "num_tokens": 11966192.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5177389085292816, | |
| "epoch": 2.7350746268656714, | |
| "grad_norm": 0.18631240725517273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5224716663360596, | |
| "mean_token_accuracy": 0.78856061398983, | |
| "num_tokens": 11982767.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5130163431167603, | |
| "epoch": 2.7388059701492535, | |
| "grad_norm": 0.18318809568881989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5186882019042969, | |
| "mean_token_accuracy": 0.7916167229413986, | |
| "num_tokens": 11998980.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5177224427461624, | |
| "epoch": 2.7425373134328357, | |
| "grad_norm": 0.15900187194347382, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5131608843803406, | |
| "mean_token_accuracy": 0.7938690781593323, | |
| "num_tokens": 12015535.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.526519387960434, | |
| "epoch": 2.746268656716418, | |
| "grad_norm": 0.174263134598732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261813402175903, | |
| "mean_token_accuracy": 0.7892861515283585, | |
| "num_tokens": 12031788.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5191493332386017, | |
| "epoch": 2.75, | |
| "grad_norm": 0.18909449875354767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240525007247925, | |
| "mean_token_accuracy": 0.7878368943929672, | |
| "num_tokens": 12047980.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5201373547315598, | |
| "epoch": 2.753731343283582, | |
| "grad_norm": 0.18388764560222626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292187929153442, | |
| "mean_token_accuracy": 0.7905917465686798, | |
| "num_tokens": 12064314.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5199328809976578, | |
| "epoch": 2.7574626865671643, | |
| "grad_norm": 0.19509336352348328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5188801884651184, | |
| "mean_token_accuracy": 0.7895538657903671, | |
| "num_tokens": 12080751.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5277723222970963, | |
| "epoch": 2.7611940298507465, | |
| "grad_norm": 0.16337504982948303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206757187843323, | |
| "mean_token_accuracy": 0.7895227074623108, | |
| "num_tokens": 12097014.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5113491863012314, | |
| "epoch": 2.7649253731343286, | |
| "grad_norm": 0.17909789085388184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5122904777526855, | |
| "mean_token_accuracy": 0.7908981740474701, | |
| "num_tokens": 12113252.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5200309902429581, | |
| "epoch": 2.7686567164179103, | |
| "grad_norm": 0.17350299656391144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194863677024841, | |
| "mean_token_accuracy": 0.7900390475988388, | |
| "num_tokens": 12129709.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5226462483406067, | |
| "epoch": 2.7723880597014925, | |
| "grad_norm": 0.21633893251419067, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241018533706665, | |
| "mean_token_accuracy": 0.7901509553194046, | |
| "num_tokens": 12146084.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5130392387509346, | |
| "epoch": 2.7761194029850746, | |
| "grad_norm": 0.19013682007789612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189740061759949, | |
| "mean_token_accuracy": 0.7909031510353088, | |
| "num_tokens": 12162307.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5150926038622856, | |
| "epoch": 2.779850746268657, | |
| "grad_norm": 0.2071346938610077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5166252255439758, | |
| "mean_token_accuracy": 0.7929645031690598, | |
| "num_tokens": 12178654.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5175644010305405, | |
| "epoch": 2.783582089552239, | |
| "grad_norm": 0.1927538812160492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234126448631287, | |
| "mean_token_accuracy": 0.7895888537168503, | |
| "num_tokens": 12194657.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5124155282974243, | |
| "epoch": 2.7873134328358207, | |
| "grad_norm": 0.20746196806430817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5111269950866699, | |
| "mean_token_accuracy": 0.7925330102443695, | |
| "num_tokens": 12211150.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5269140601158142, | |
| "epoch": 2.791044776119403, | |
| "grad_norm": 0.16280147433280945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5249094367027283, | |
| "mean_token_accuracy": 0.7845876812934875, | |
| "num_tokens": 12227551.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5178611427545547, | |
| "epoch": 2.794776119402985, | |
| "grad_norm": 0.23840144276618958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257112383842468, | |
| "mean_token_accuracy": 0.7894743531942368, | |
| "num_tokens": 12243876.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5116888880729675, | |
| "epoch": 2.798507462686567, | |
| "grad_norm": 0.18411816656589508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5144840478897095, | |
| "mean_token_accuracy": 0.7931785434484482, | |
| "num_tokens": 12260217.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5289624482393265, | |
| "epoch": 2.8022388059701493, | |
| "grad_norm": 0.22270359098911285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311276316642761, | |
| "mean_token_accuracy": 0.7855756431818008, | |
| "num_tokens": 12276532.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.547882929444313, | |
| "epoch": 2.8059701492537314, | |
| "grad_norm": 0.15829682350158691, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395496487617493, | |
| "mean_token_accuracy": 0.7822854816913605, | |
| "num_tokens": 12292809.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5366968065500259, | |
| "epoch": 2.8097014925373136, | |
| "grad_norm": 0.17022006213665009, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253041982650757, | |
| "mean_token_accuracy": 0.7889240682125092, | |
| "num_tokens": 12309272.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5104647874832153, | |
| "epoch": 2.8134328358208958, | |
| "grad_norm": 0.20047977566719055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5114369988441467, | |
| "mean_token_accuracy": 0.7932160943746567, | |
| "num_tokens": 12325725.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.530600056052208, | |
| "epoch": 2.8171641791044775, | |
| "grad_norm": 0.18938857316970825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256994366645813, | |
| "mean_token_accuracy": 0.787563219666481, | |
| "num_tokens": 12341933.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5128819495439529, | |
| "epoch": 2.8208955223880596, | |
| "grad_norm": 0.19077159464359283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5233974456787109, | |
| "mean_token_accuracy": 0.7869286239147186, | |
| "num_tokens": 12358445.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5205030888319016, | |
| "epoch": 2.824626865671642, | |
| "grad_norm": 0.2066243290901184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527535617351532, | |
| "mean_token_accuracy": 0.7873703986406326, | |
| "num_tokens": 12374542.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5135227516293526, | |
| "epoch": 2.828358208955224, | |
| "grad_norm": 0.20685350894927979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181005597114563, | |
| "mean_token_accuracy": 0.7896196097135544, | |
| "num_tokens": 12390788.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5336467772722244, | |
| "epoch": 2.832089552238806, | |
| "grad_norm": 0.1939532607793808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294384956359863, | |
| "mean_token_accuracy": 0.7889339476823807, | |
| "num_tokens": 12407229.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5257266908884048, | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 0.1771981567144394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216140151023865, | |
| "mean_token_accuracy": 0.7899226099252701, | |
| "num_tokens": 12423846.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5299984812736511, | |
| "epoch": 2.83955223880597, | |
| "grad_norm": 0.20455680787563324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296297073364258, | |
| "mean_token_accuracy": 0.7862879633903503, | |
| "num_tokens": 12440158.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5143841132521629, | |
| "epoch": 2.843283582089552, | |
| "grad_norm": 0.2076958268880844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5176342725753784, | |
| "mean_token_accuracy": 0.7894581258296967, | |
| "num_tokens": 12456654.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.4974513649940491, | |
| "epoch": 2.8470149253731343, | |
| "grad_norm": 0.193134143948555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5035260319709778, | |
| "mean_token_accuracy": 0.7979147285223007, | |
| "num_tokens": 12472987.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.516231395304203, | |
| "epoch": 2.8507462686567164, | |
| "grad_norm": 0.19579733908176422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523535430431366, | |
| "mean_token_accuracy": 0.7885937541723251, | |
| "num_tokens": 12489201.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5090928375720978, | |
| "epoch": 2.8544776119402986, | |
| "grad_norm": 0.1745532602071762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5120922327041626, | |
| "mean_token_accuracy": 0.7926068156957626, | |
| "num_tokens": 12505297.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5212984532117844, | |
| "epoch": 2.8582089552238807, | |
| "grad_norm": 0.1687193065881729, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5186242461204529, | |
| "mean_token_accuracy": 0.7898098975419998, | |
| "num_tokens": 12521805.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5455201715230942, | |
| "epoch": 2.861940298507463, | |
| "grad_norm": 0.14300285279750824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431771278381348, | |
| "mean_token_accuracy": 0.7779514342546463, | |
| "num_tokens": 12538465.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5209106504917145, | |
| "epoch": 2.8656716417910446, | |
| "grad_norm": 0.16800960898399353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5184243321418762, | |
| "mean_token_accuracy": 0.7890264838933945, | |
| "num_tokens": 12554886.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5088474899530411, | |
| "epoch": 2.8694029850746268, | |
| "grad_norm": 0.1462314873933792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5083324909210205, | |
| "mean_token_accuracy": 0.7934228926897049, | |
| "num_tokens": 12571276.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5271053463220596, | |
| "epoch": 2.873134328358209, | |
| "grad_norm": 0.16391947865486145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293073654174805, | |
| "mean_token_accuracy": 0.7859203815460205, | |
| "num_tokens": 12587621.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5014189630746841, | |
| "epoch": 2.876865671641791, | |
| "grad_norm": 0.16328679025173187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5073498487472534, | |
| "mean_token_accuracy": 0.7924041301012039, | |
| "num_tokens": 12604113.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5268891751766205, | |
| "epoch": 2.8805970149253732, | |
| "grad_norm": 0.21644122898578644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315952301025391, | |
| "mean_token_accuracy": 0.7878720760345459, | |
| "num_tokens": 12620599.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5303193777799606, | |
| "epoch": 2.8843283582089554, | |
| "grad_norm": 0.16348110139369965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203503966331482, | |
| "mean_token_accuracy": 0.7895929515361786, | |
| "num_tokens": 12636920.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5373167991638184, | |
| "epoch": 2.888059701492537, | |
| "grad_norm": 0.1674329936504364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308367609977722, | |
| "mean_token_accuracy": 0.7839034348726273, | |
| "num_tokens": 12653507.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5245395004749298, | |
| "epoch": 2.8917910447761193, | |
| "grad_norm": 0.16798977553844452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.525133490562439, | |
| "mean_token_accuracy": 0.7879597991704941, | |
| "num_tokens": 12669748.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.4995606988668442, | |
| "epoch": 2.8955223880597014, | |
| "grad_norm": 0.16923899948596954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5072147250175476, | |
| "mean_token_accuracy": 0.7954233735799789, | |
| "num_tokens": 12686075.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5168571919202805, | |
| "epoch": 2.8992537313432836, | |
| "grad_norm": 0.19585320353507996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531486988067627, | |
| "mean_token_accuracy": 0.786114364862442, | |
| "num_tokens": 12702228.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5194735378026962, | |
| "epoch": 2.9029850746268657, | |
| "grad_norm": 0.17308996617794037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222083926200867, | |
| "mean_token_accuracy": 0.7887429147958755, | |
| "num_tokens": 12718513.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5187652111053467, | |
| "epoch": 2.906716417910448, | |
| "grad_norm": 0.18012917041778564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5144599676132202, | |
| "mean_token_accuracy": 0.7928689271211624, | |
| "num_tokens": 12734912.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5175924748182297, | |
| "epoch": 2.91044776119403, | |
| "grad_norm": 0.15708911418914795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5127027034759521, | |
| "mean_token_accuracy": 0.7910457104444504, | |
| "num_tokens": 12751312.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5184929892420769, | |
| "epoch": 2.914179104477612, | |
| "grad_norm": 0.17460955679416656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223311185836792, | |
| "mean_token_accuracy": 0.7881267666816711, | |
| "num_tokens": 12767906.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5162710845470428, | |
| "epoch": 2.917910447761194, | |
| "grad_norm": 0.1744503378868103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5184698104858398, | |
| "mean_token_accuracy": 0.7896480411291122, | |
| "num_tokens": 12784363.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5054134130477905, | |
| "epoch": 2.921641791044776, | |
| "grad_norm": 0.16419187188148499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5100088715553284, | |
| "mean_token_accuracy": 0.7937912940979004, | |
| "num_tokens": 12800729.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5267587229609489, | |
| "epoch": 2.925373134328358, | |
| "grad_norm": 0.15712794661521912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234281420707703, | |
| "mean_token_accuracy": 0.7873355746269226, | |
| "num_tokens": 12817275.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5252643376588821, | |
| "epoch": 2.9291044776119404, | |
| "grad_norm": 0.17461742460727692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5149291753768921, | |
| "mean_token_accuracy": 0.792007714509964, | |
| "num_tokens": 12833722.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5310375690460205, | |
| "epoch": 2.9328358208955225, | |
| "grad_norm": 0.16197697818279266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280002355575562, | |
| "mean_token_accuracy": 0.7869867831468582, | |
| "num_tokens": 12850311.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5165882706642151, | |
| "epoch": 2.9365671641791042, | |
| "grad_norm": 0.18169313669204712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169544219970703, | |
| "mean_token_accuracy": 0.7926650643348694, | |
| "num_tokens": 12866551.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.506410725414753, | |
| "epoch": 2.9402985074626864, | |
| "grad_norm": 0.16465988755226135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5119289755821228, | |
| "mean_token_accuracy": 0.7941572368144989, | |
| "num_tokens": 12882861.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5014762431383133, | |
| "epoch": 2.9440298507462686, | |
| "grad_norm": 0.18377594649791718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5110628008842468, | |
| "mean_token_accuracy": 0.7946459800004959, | |
| "num_tokens": 12899241.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5248052775859833, | |
| "epoch": 2.9477611940298507, | |
| "grad_norm": 0.20053857564926147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319278240203857, | |
| "mean_token_accuracy": 0.7844424396753311, | |
| "num_tokens": 12915385.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.53006511926651, | |
| "epoch": 2.951492537313433, | |
| "grad_norm": 0.17584678530693054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255709886550903, | |
| "mean_token_accuracy": 0.7863388210535049, | |
| "num_tokens": 12931592.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5275840014219284, | |
| "epoch": 2.955223880597015, | |
| "grad_norm": 0.17536833882331848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5213799476623535, | |
| "mean_token_accuracy": 0.7920176684856415, | |
| "num_tokens": 12948004.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5442412495613098, | |
| "epoch": 2.958955223880597, | |
| "grad_norm": 0.17195221781730652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382991433143616, | |
| "mean_token_accuracy": 0.7807125151157379, | |
| "num_tokens": 12964350.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.514294296503067, | |
| "epoch": 2.9626865671641793, | |
| "grad_norm": 0.1958279013633728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5191056728363037, | |
| "mean_token_accuracy": 0.7889736741781235, | |
| "num_tokens": 12980870.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.516971156001091, | |
| "epoch": 2.966417910447761, | |
| "grad_norm": 0.17031143605709076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235239863395691, | |
| "mean_token_accuracy": 0.7902554422616959, | |
| "num_tokens": 12997265.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.519709937274456, | |
| "epoch": 2.970149253731343, | |
| "grad_norm": 0.19241590797901154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290430188179016, | |
| "mean_token_accuracy": 0.786635085940361, | |
| "num_tokens": 13013641.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5278842747211456, | |
| "epoch": 2.9738805970149254, | |
| "grad_norm": 0.1847175806760788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301830768585205, | |
| "mean_token_accuracy": 0.7861872166395187, | |
| "num_tokens": 13030089.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.543852686882019, | |
| "epoch": 2.9776119402985075, | |
| "grad_norm": 0.1565551459789276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390616655349731, | |
| "mean_token_accuracy": 0.7804800420999527, | |
| "num_tokens": 13046782.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5507520437240601, | |
| "epoch": 2.9813432835820897, | |
| "grad_norm": 0.19360534846782684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457417964935303, | |
| "mean_token_accuracy": 0.7808282524347305, | |
| "num_tokens": 13063260.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5130215361714363, | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.17565752565860748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5124551057815552, | |
| "mean_token_accuracy": 0.7940163463354111, | |
| "num_tokens": 13079496.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5296107679605484, | |
| "epoch": 2.9888059701492535, | |
| "grad_norm": 0.18528884649276733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258690714836121, | |
| "mean_token_accuracy": 0.7890074849128723, | |
| "num_tokens": 13095995.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5083938241004944, | |
| "epoch": 2.9925373134328357, | |
| "grad_norm": 0.17645564675331116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169539451599121, | |
| "mean_token_accuracy": 0.7913031429052353, | |
| "num_tokens": 13112668.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5120368450880051, | |
| "epoch": 2.996268656716418, | |
| "grad_norm": 0.1844874620437622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5195419192314148, | |
| "mean_token_accuracy": 0.7927880436182022, | |
| "num_tokens": 13128901.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5261139273643494, | |
| "epoch": 3.0, | |
| "grad_norm": 0.19706764817237854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334464311599731, | |
| "mean_token_accuracy": 0.7812356650829315, | |
| "num_tokens": 13145317.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2252935644732457e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |