| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.3738317757009346, |
| "eval_steps": 100, |
| "global_step": 700, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.1900819569826127, |
| "epoch": 0.033984706881903144, |
| "grad_norm": 0.107421875, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 1.2022, |
| "mean_token_accuracy": 0.7261606067419052, |
| "num_tokens": 32329.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.1123934835195541, |
| "epoch": 0.06796941376380629, |
| "grad_norm": 0.138671875, |
| "learning_rate": 0.00014074074074074076, |
| "loss": 1.1277, |
| "mean_token_accuracy": 0.7318252056837082, |
| "num_tokens": 67111.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.0879137217998505, |
| "epoch": 0.10195412064570943, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.00019999731865174213, |
| "loss": 1.0625, |
| "mean_token_accuracy": 0.7478782564401627, |
| "num_tokens": 101352.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.9986169628798962, |
| "epoch": 0.13593882752761258, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.00019990348656007261, |
| "loss": 1.0079, |
| "mean_token_accuracy": 0.7581020414829254, |
| "num_tokens": 133883.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.05185257345438, |
| "epoch": 0.16992353440951571, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.00019967573081342103, |
| "loss": 1.0335, |
| "mean_token_accuracy": 0.7543211042881012, |
| "num_tokens": 168562.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.9918051429092885, |
| "epoch": 0.20390824129141885, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.00019931435672527624, |
| "loss": 0.9977, |
| "mean_token_accuracy": 0.755571472644806, |
| "num_tokens": 202299.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8909615643322468, |
| "epoch": 0.23789294817332202, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.00019881984872856817, |
| "loss": 0.9168, |
| "mean_token_accuracy": 0.7724857151508331, |
| "num_tokens": 237132.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.9534411959350109, |
| "epoch": 0.27187765505522515, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.00019819286972627066, |
| "loss": 0.9673, |
| "mean_token_accuracy": 0.7637621074914932, |
| "num_tokens": 271644.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.9248364262282849, |
| "epoch": 0.3058623619371283, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.00019743426020275994, |
| "loss": 0.9077, |
| "mean_token_accuracy": 0.7676862999796867, |
| "num_tokens": 306939.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.0025891564786433, |
| "epoch": 0.33984706881903143, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.00019654503709711982, |
| "loss": 1.0295, |
| "mean_token_accuracy": 0.7536625176668167, |
| "num_tokens": 341079.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.33984706881903143, |
| "eval_entropy": 0.9100993389175052, |
| "eval_loss": 0.9260265231132507, |
| "eval_mean_token_accuracy": 0.7760482584538103, |
| "eval_num_tokens": 341079.0, |
| "eval_runtime": 140.9825, |
| "eval_samples_per_second": 1.043, |
| "eval_steps_per_second": 1.043, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.9492803812026978, |
| "epoch": 0.37383177570093457, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.00019552639243990402, |
| "loss": 0.9391, |
| "mean_token_accuracy": 0.7675268396735191, |
| "num_tokens": 374431.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.86210857629776, |
| "epoch": 0.4078164825828377, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.00019437969175518295, |
| "loss": 0.8868, |
| "mean_token_accuracy": 0.7805617764592171, |
| "num_tokens": 410401.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9226490631699562, |
| "epoch": 0.44180118946474084, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.0001931064722300175, |
| "loss": 0.9118, |
| "mean_token_accuracy": 0.7770121544599533, |
| "num_tokens": 443280.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.8999310165643692, |
| "epoch": 0.47578589634664403, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00019170844065381285, |
| "loss": 0.9117, |
| "mean_token_accuracy": 0.7742907002568244, |
| "num_tokens": 478900.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.8831319987773896, |
| "epoch": 0.5097706032285472, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.00019018747113031564, |
| "loss": 0.8893, |
| "mean_token_accuracy": 0.7864658862352372, |
| "num_tokens": 513687.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.8636823572218418, |
| "epoch": 0.5437553101104503, |
| "grad_norm": 0.490234375, |
| "learning_rate": 0.000188545602565321, |
| "loss": 0.8824, |
| "mean_token_accuracy": 0.7811477512121201, |
| "num_tokens": 549223.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.9621140897274018, |
| "epoch": 0.5777400169923534, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.00018678503593345754, |
| "loss": 0.959, |
| "mean_token_accuracy": 0.765991534292698, |
| "num_tokens": 582348.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9770274326205254, |
| "epoch": 0.6117247238742566, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.00018490813132771393, |
| "loss": 0.9698, |
| "mean_token_accuracy": 0.767307311296463, |
| "num_tokens": 614105.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.8605956405401229, |
| "epoch": 0.6457094307561597, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.00018291740479566283, |
| "loss": 0.8559, |
| "mean_token_accuracy": 0.7881909653544426, |
| "num_tokens": 649045.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.8583430543541908, |
| "epoch": 0.6796941376380629, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.00018081552496662258, |
| "loss": 0.8807, |
| "mean_token_accuracy": 0.7877280384302139, |
| "num_tokens": 681327.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6796941376380629, |
| "eval_entropy": 0.896664229785504, |
| "eval_loss": 0.89118492603302, |
| "eval_mean_token_accuracy": 0.7833221583139329, |
| "eval_num_tokens": 681327.0, |
| "eval_runtime": 140.9445, |
| "eval_samples_per_second": 1.043, |
| "eval_steps_per_second": 1.043, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.9148187682032585, |
| "epoch": 0.713678844519966, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.00017860530947427875, |
| "loss": 0.9274, |
| "mean_token_accuracy": 0.7749700739979744, |
| "num_tokens": 714260.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.923950819671154, |
| "epoch": 0.7476635514018691, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.0001762897211795607, |
| "loss": 0.8731, |
| "mean_token_accuracy": 0.7750329807400703, |
| "num_tokens": 746221.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.0102886088192462, |
| "epoch": 0.7816482582837723, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.0001738718641988365, |
| "loss": 1.0153, |
| "mean_token_accuracy": 0.7520556971430779, |
| "num_tokens": 781925.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.8034560449421406, |
| "epoch": 0.8156329651656754, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.00017135497974275088, |
| "loss": 0.8024, |
| "mean_token_accuracy": 0.8065776988863945, |
| "num_tokens": 819257.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.8587650842964649, |
| "epoch": 0.8496176720475785, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.00016874244177128396, |
| "loss": 0.8729, |
| "mean_token_accuracy": 0.7839998126029968, |
| "num_tokens": 854177.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.8471586465835571, |
| "epoch": 0.8836023789294817, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.00016603775247085546, |
| "loss": 0.8693, |
| "mean_token_accuracy": 0.7884811326861382, |
| "num_tokens": 887426.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.8740401305258274, |
| "epoch": 0.9175870858113849, |
| "grad_norm": 0.3125, |
| "learning_rate": 0.00016324453755953773, |
| "loss": 0.881, |
| "mean_token_accuracy": 0.787057913839817, |
| "num_tokens": 918169.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.8283644251525402, |
| "epoch": 0.9515717926932881, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.00016036654142667043, |
| "loss": 0.8293, |
| "mean_token_accuracy": 0.7984860435128212, |
| "num_tokens": 953966.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.8554800219833851, |
| "epoch": 0.9855564995751912, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.00015740762211339314, |
| "loss": 0.8646, |
| "mean_token_accuracy": 0.7841584324836731, |
| "num_tokens": 986605.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.8761118473233404, |
| "epoch": 1.0169923534409515, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.00015437174614082416, |
| "loss": 0.9419, |
| "mean_token_accuracy": 0.7891149762514476, |
| "num_tokens": 1018372.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.0169923534409515, |
| "eval_entropy": 0.8560160027474774, |
| "eval_loss": 0.8672717809677124, |
| "eval_mean_token_accuracy": 0.7890248249988167, |
| "eval_num_tokens": 1018372.0, |
| "eval_runtime": 140.8836, |
| "eval_samples_per_second": 1.043, |
| "eval_steps_per_second": 1.043, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.835958081483841, |
| "epoch": 1.0509770603228548, |
| "grad_norm": 0.3515625, |
| "learning_rate": 0.00015126298319281857, |
| "loss": 0.8349, |
| "mean_token_accuracy": 0.7921741649508476, |
| "num_tokens": 1053253.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.8159842237830162, |
| "epoch": 1.0849617672047578, |
| "grad_norm": 0.46875, |
| "learning_rate": 0.00014808550066043352, |
| "loss": 0.8036, |
| "mean_token_accuracy": 0.7972497373819352, |
| "num_tokens": 1087084.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.8521868549287319, |
| "epoch": 1.118946474086661, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.00014484355805541413, |
| "loss": 0.9106, |
| "mean_token_accuracy": 0.7848088085651398, |
| "num_tokens": 1121833.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.8146591357886791, |
| "epoch": 1.152931180968564, |
| "grad_norm": 0.412109375, |
| "learning_rate": 0.00014154150130018866, |
| "loss": 0.7855, |
| "mean_token_accuracy": 0.7983845800161362, |
| "num_tokens": 1155315.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.8469755969941616, |
| "epoch": 1.1869158878504673, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.00013818375690202774, |
| "loss": 0.8677, |
| "mean_token_accuracy": 0.7855840787291527, |
| "num_tokens": 1188884.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.8589501097798348, |
| "epoch": 1.2209005947323703, |
| "grad_norm": 0.38671875, |
| "learning_rate": 0.000134774826019177, |
| "loss": 0.8452, |
| "mean_token_accuracy": 0.786843791604042, |
| "num_tokens": 1223117.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8148165218532085, |
| "epoch": 1.2548853016142736, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.0001313192784269179, |
| "loss": 0.7977, |
| "mean_token_accuracy": 0.7989379152655601, |
| "num_tokens": 1257970.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.8345869470387697, |
| "epoch": 1.2888700084961768, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.0001278217463916453, |
| "loss": 0.8308, |
| "mean_token_accuracy": 0.7944282591342926, |
| "num_tokens": 1294426.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.8046329416334629, |
| "epoch": 1.3228547153780799, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.00012428691846117372, |
| "loss": 0.7869, |
| "mean_token_accuracy": 0.8005647033452987, |
| "num_tokens": 1327540.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.8379782371222972, |
| "epoch": 1.3568394222599829, |
| "grad_norm": 0.4375, |
| "learning_rate": 0.00012071953317959692, |
| "loss": 0.8281, |
| "mean_token_accuracy": 0.7926659971475601, |
| "num_tokens": 1361518.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3568394222599829, |
| "eval_entropy": 0.8221898901016533, |
| "eval_loss": 0.8522771000862122, |
| "eval_mean_token_accuracy": 0.7922491832655303, |
| "eval_num_tokens": 1361518.0, |
| "eval_runtime": 140.9731, |
| "eval_samples_per_second": 1.043, |
| "eval_steps_per_second": 1.043, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.8128557071089745, |
| "epoch": 1.3908241291418861, |
| "grad_norm": 0.376953125, |
| "learning_rate": 0.00011712437273512561, |
| "loss": 0.7984, |
| "mean_token_accuracy": 0.7952910155057907, |
| "num_tokens": 1395450.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.7772056467831134, |
| "epoch": 1.4248088360237894, |
| "grad_norm": 0.388671875, |
| "learning_rate": 0.00011350625654941918, |
| "loss": 0.7676, |
| "mean_token_accuracy": 0.8111284270882606, |
| "num_tokens": 1429377.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.77816668972373, |
| "epoch": 1.4587935429056924, |
| "grad_norm": 0.357421875, |
| "learning_rate": 0.00010987003481700455, |
| "loss": 0.7709, |
| "mean_token_accuracy": 0.8049451917409897, |
| "num_tokens": 1462442.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.8393321461975575, |
| "epoch": 1.4927782497875957, |
| "grad_norm": 0.392578125, |
| "learning_rate": 0.00010622058200344344, |
| "loss": 0.8423, |
| "mean_token_accuracy": 0.7850104674696923, |
| "num_tokens": 1497205.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.8927516497671604, |
| "epoch": 1.5267629566694987, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.00010256279031096328, |
| "loss": 0.8664, |
| "mean_token_accuracy": 0.7828694671392441, |
| "num_tokens": 1531243.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.7833915807306766, |
| "epoch": 1.560747663551402, |
| "grad_norm": 0.466796875, |
| "learning_rate": 9.890156312031163e-05, |
| "loss": 0.7676, |
| "mean_token_accuracy": 0.8062808975577355, |
| "num_tokens": 1565238.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.8227095231413841, |
| "epoch": 1.594732370433305, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.524180841762577e-05, |
| "loss": 0.8238, |
| "mean_token_accuracy": 0.796034836769104, |
| "num_tokens": 1599147.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.7979710936546326, |
| "epoch": 1.6287170773152082, |
| "grad_norm": 0.365234375, |
| "learning_rate": 9.1588432215128e-05, |
| "loss": 0.782, |
| "mean_token_accuracy": 0.8032634913921356, |
| "num_tokens": 1632572.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.7890071399509907, |
| "epoch": 1.6627017841971115, |
| "grad_norm": 0.5, |
| "learning_rate": 8.79463319744677e-05, |
| "loss": 0.8058, |
| "mean_token_accuracy": 0.8048980295658111, |
| "num_tokens": 1664018.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8366815261542797, |
| "epoch": 1.6966864910790145, |
| "grad_norm": 0.34375, |
| "learning_rate": 8.432039004152519e-05, |
| "loss": 0.8222, |
| "mean_token_accuracy": 0.7975986883044243, |
| "num_tokens": 1698322.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6966864910790145, |
| "eval_entropy": 0.8350493262211481, |
| "eval_loss": 0.8397489786148071, |
| "eval_mean_token_accuracy": 0.7947951194380416, |
| "eval_num_tokens": 1698322.0, |
| "eval_runtime": 140.612, |
| "eval_samples_per_second": 1.045, |
| "eval_steps_per_second": 1.045, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.7970431953668594, |
| "epoch": 1.7306711979609175, |
| "grad_norm": 0.5546875, |
| "learning_rate": 8.071546710147911e-05, |
| "loss": 0.7672, |
| "mean_token_accuracy": 0.8059538155794144, |
| "num_tokens": 1732641.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.8044024340808391, |
| "epoch": 1.7646559048428208, |
| "grad_norm": 0.470703125, |
| "learning_rate": 7.713639566291027e-05, |
| "loss": 0.8155, |
| "mean_token_accuracy": 0.7996707066893578, |
| "num_tokens": 1766698.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.7848532184958458, |
| "epoch": 1.798640611724724, |
| "grad_norm": 0.390625, |
| "learning_rate": 7.358797357967749e-05, |
| "loss": 0.7718, |
| "mean_token_accuracy": 0.8004196003079415, |
| "num_tokens": 1798260.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.8484033491462469, |
| "epoch": 1.832625318606627, |
| "grad_norm": 0.44140625, |
| "learning_rate": 7.007495761924862e-05, |
| "loss": 0.8354, |
| "mean_token_accuracy": 0.7922118753194809, |
| "num_tokens": 1834411.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.8166721411049366, |
| "epoch": 1.86661002548853, |
| "grad_norm": 0.390625, |
| "learning_rate": 6.660205708610987e-05, |
| "loss": 0.8533, |
| "mean_token_accuracy": 0.7995400235056878, |
| "num_tokens": 1868051.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.7962750904262066, |
| "epoch": 1.9005947323704333, |
| "grad_norm": 0.326171875, |
| "learning_rate": 6.317392750879978e-05, |
| "loss": 0.7929, |
| "mean_token_accuracy": 0.8041162744164467, |
| "num_tokens": 1902715.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.8834932953119278, |
| "epoch": 1.9345794392523366, |
| "grad_norm": 0.416015625, |
| "learning_rate": 5.979516439903221e-05, |
| "loss": 0.8764, |
| "mean_token_accuracy": 0.7862503513693809, |
| "num_tokens": 1934823.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.7844171606004238, |
| "epoch": 1.9685641461342396, |
| "grad_norm": 0.33984375, |
| "learning_rate": 5.647029709127355e-05, |
| "loss": 0.805, |
| "mean_token_accuracy": 0.8080376788973809, |
| "num_tokens": 1970939.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.7350320663001086, |
| "epoch": 2.0, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.3203782671032055e-05, |
| "loss": 0.7487, |
| "mean_token_accuracy": 0.8147893048621513, |
| "num_tokens": 2001880.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.7770363502204418, |
| "epoch": 2.033984706881903, |
| "grad_norm": 0.5078125, |
| "learning_rate": 5.000000000000002e-05, |
| "loss": 0.7848, |
| "mean_token_accuracy": 0.8036531031131744, |
| "num_tokens": 2032980.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.033984706881903, |
| "eval_entropy": 0.7900621154073144, |
| "eval_loss": 0.8340142369270325, |
| "eval_mean_token_accuracy": 0.7969201965396907, |
| "eval_num_tokens": 2032980.0, |
| "eval_runtime": 140.9352, |
| "eval_samples_per_second": 1.043, |
| "eval_steps_per_second": 1.043, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.756380145996809, |
| "epoch": 2.0679694137638065, |
| "grad_norm": 0.458984375, |
| "learning_rate": 4.686324384605629e-05, |
| "loss": 0.7823, |
| "mean_token_accuracy": 0.8089984133839607, |
| "num_tokens": 2067951.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.7689781740307808, |
| "epoch": 2.1019541206457095, |
| "grad_norm": 0.341796875, |
| "learning_rate": 4.3797719126e-05, |
| "loss": 0.7489, |
| "mean_token_accuracy": 0.8059608668088913, |
| "num_tokens": 2102690.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.7978510297834873, |
| "epoch": 2.1359388275276125, |
| "grad_norm": 0.421875, |
| "learning_rate": 4.08075352687318e-05, |
| "loss": 0.7711, |
| "mean_token_accuracy": 0.8028671458363533, |
| "num_tokens": 2136553.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.7843278538435697, |
| "epoch": 2.1699235344095156, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.789670070643982e-05, |
| "loss": 0.7817, |
| "mean_token_accuracy": 0.8063826531171798, |
| "num_tokens": 2170922.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.8048521246761083, |
| "epoch": 2.203908241291419, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.506911750117469e-05, |
| "loss": 0.8171, |
| "mean_token_accuracy": 0.7995030015707016, |
| "num_tokens": 2205971.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.7707572512328624, |
| "epoch": 2.237892948173322, |
| "grad_norm": 0.453125, |
| "learning_rate": 3.232857611401693e-05, |
| "loss": 0.7546, |
| "mean_token_accuracy": 0.8126760870218277, |
| "num_tokens": 2241219.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.868005882203579, |
| "epoch": 2.271877655055225, |
| "grad_norm": 0.466796875, |
| "learning_rate": 2.9678750323848893e-05, |
| "loss": 0.8657, |
| "mean_token_accuracy": 0.7834504991769791, |
| "num_tokens": 2275600.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.7262178905308246, |
| "epoch": 2.305862361937128, |
| "grad_norm": 0.341796875, |
| "learning_rate": 2.71231923025427e-05, |
| "loss": 0.7005, |
| "mean_token_accuracy": 0.821708083152771, |
| "num_tokens": 2310278.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.7482855342328548, |
| "epoch": 2.3398470688190316, |
| "grad_norm": 0.4453125, |
| "learning_rate": 2.4665327853166075e-05, |
| "loss": 0.745, |
| "mean_token_accuracy": 0.8137555688619613, |
| "num_tokens": 2344515.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.7359024606645107, |
| "epoch": 2.3738317757009346, |
| "grad_norm": 0.384765625, |
| "learning_rate": 2.2308451817589283e-05, |
| "loss": 0.7407, |
| "mean_token_accuracy": 0.8139674678444863, |
| "num_tokens": 2378108.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.3738317757009346, |
| "eval_entropy": 0.7895172556646827, |
| "eval_loss": 0.8320378661155701, |
| "eval_mean_token_accuracy": 0.7975128520102728, |
| "eval_num_tokens": 2378108.0, |
| "eval_runtime": 140.9707, |
| "eval_samples_per_second": 1.043, |
| "eval_steps_per_second": 1.043, |
| "step": 700 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 885, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.439848947662848e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|