| { |
| "best_global_step": 500, |
| "best_metric": 0.19536998867988586, |
| "best_model_checkpoint": "outputs/checkpoint-500", |
| "epoch": 2.247191011235955, |
| "eval_steps": 100, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03745318352059925, |
| "grad_norm": 2.6471238136291504, |
| "learning_rate": 1.9865168539325844e-05, |
| "loss": 3.9924, |
| "mean_token_accuracy": 0.3569513201713562, |
| "num_tokens": 1110.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0749063670411985, |
| "grad_norm": 2.9193994998931885, |
| "learning_rate": 1.9715355805243446e-05, |
| "loss": 2.5013, |
| "mean_token_accuracy": 0.5000596195459366, |
| "num_tokens": 2220.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11235955056179775, |
| "grad_norm": 1.090408444404602, |
| "learning_rate": 1.956554307116105e-05, |
| "loss": 1.2021, |
| "mean_token_accuracy": 0.7512393116950988, |
| "num_tokens": 3329.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.149812734082397, |
| "grad_norm": 1.412244200706482, |
| "learning_rate": 1.9415730337078652e-05, |
| "loss": 0.6237, |
| "mean_token_accuracy": 0.8658290803432465, |
| "num_tokens": 4437.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.18726591760299627, |
| "grad_norm": 0.9774134755134583, |
| "learning_rate": 1.9265917602996254e-05, |
| "loss": 0.4264, |
| "mean_token_accuracy": 0.9105254471302032, |
| "num_tokens": 5553.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2247191011235955, |
| "grad_norm": 0.6166325211524963, |
| "learning_rate": 1.9116104868913857e-05, |
| "loss": 0.3806, |
| "mean_token_accuracy": 0.8969066739082336, |
| "num_tokens": 6660.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.26217228464419473, |
| "grad_norm": 0.5820680856704712, |
| "learning_rate": 1.8966292134831463e-05, |
| "loss": 0.3484, |
| "mean_token_accuracy": 0.8972096979618073, |
| "num_tokens": 7769.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.299625468164794, |
| "grad_norm": 0.31422552466392517, |
| "learning_rate": 1.8816479400749066e-05, |
| "loss": 0.3196, |
| "mean_token_accuracy": 0.898263669013977, |
| "num_tokens": 8880.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.33707865168539325, |
| "grad_norm": 0.5825852155685425, |
| "learning_rate": 1.866666666666667e-05, |
| "loss": 0.2965, |
| "mean_token_accuracy": 0.9046498596668243, |
| "num_tokens": 9992.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.37453183520599254, |
| "grad_norm": 0.38430944085121155, |
| "learning_rate": 1.851685393258427e-05, |
| "loss": 0.2839, |
| "mean_token_accuracy": 0.9051393151283265, |
| "num_tokens": 11098.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.37453183520599254, |
| "eval_loss": 0.2852214574813843, |
| "eval_mean_token_accuracy": 0.9032742083072662, |
| "eval_num_tokens": 11098.0, |
| "eval_runtime": 2.4929, |
| "eval_samples_per_second": 11.633, |
| "eval_steps_per_second": 1.605, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.41198501872659177, |
| "grad_norm": 0.312187522649765, |
| "learning_rate": 1.8367041198501874e-05, |
| "loss": 0.2752, |
| "mean_token_accuracy": 0.9036725044250489, |
| "num_tokens": 12207.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.449438202247191, |
| "grad_norm": 0.3875369131565094, |
| "learning_rate": 1.8217228464419477e-05, |
| "loss": 0.2659, |
| "mean_token_accuracy": 0.9044483065605163, |
| "num_tokens": 13316.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4868913857677903, |
| "grad_norm": 0.6050882339477539, |
| "learning_rate": 1.8067415730337083e-05, |
| "loss": 0.258, |
| "mean_token_accuracy": 0.9100114285945893, |
| "num_tokens": 14426.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5243445692883895, |
| "grad_norm": 0.5287177562713623, |
| "learning_rate": 1.7917602996254685e-05, |
| "loss": 0.2455, |
| "mean_token_accuracy": 0.9222747385501862, |
| "num_tokens": 15539.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5617977528089888, |
| "grad_norm": 0.5224889516830444, |
| "learning_rate": 1.7767790262172285e-05, |
| "loss": 0.2368, |
| "mean_token_accuracy": 0.9263923704624176, |
| "num_tokens": 16647.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.599250936329588, |
| "grad_norm": 0.4501174986362457, |
| "learning_rate": 1.7617977528089887e-05, |
| "loss": 0.2299, |
| "mean_token_accuracy": 0.9313735246658326, |
| "num_tokens": 17760.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6367041198501873, |
| "grad_norm": 0.43853962421417236, |
| "learning_rate": 1.746816479400749e-05, |
| "loss": 0.2222, |
| "mean_token_accuracy": 0.9402973234653473, |
| "num_tokens": 18869.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6741573033707865, |
| "grad_norm": 0.31908461451530457, |
| "learning_rate": 1.7318352059925093e-05, |
| "loss": 0.2117, |
| "mean_token_accuracy": 0.9458102405071258, |
| "num_tokens": 19977.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7116104868913857, |
| "grad_norm": 0.2825154662132263, |
| "learning_rate": 1.71685393258427e-05, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.938564246892929, |
| "num_tokens": 21088.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7490636704119851, |
| "grad_norm": 0.2939445674419403, |
| "learning_rate": 1.70187265917603e-05, |
| "loss": 0.2051, |
| "mean_token_accuracy": 0.9392363965511322, |
| "num_tokens": 22195.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7490636704119851, |
| "eval_loss": 0.2029074728488922, |
| "eval_mean_token_accuracy": 0.9482556581497192, |
| "eval_num_tokens": 22195.0, |
| "eval_runtime": 2.4927, |
| "eval_samples_per_second": 11.634, |
| "eval_steps_per_second": 1.605, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7865168539325843, |
| "grad_norm": 0.18860529363155365, |
| "learning_rate": 1.6868913857677904e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.9431917011737824, |
| "num_tokens": 23306.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8239700374531835, |
| "grad_norm": 0.22066630423069, |
| "learning_rate": 1.6719101123595507e-05, |
| "loss": 0.2001, |
| "mean_token_accuracy": 0.9430991888046265, |
| "num_tokens": 24417.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8614232209737828, |
| "grad_norm": 0.17636580765247345, |
| "learning_rate": 1.656928838951311e-05, |
| "loss": 0.1968, |
| "mean_token_accuracy": 0.9465341567993164, |
| "num_tokens": 25522.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.898876404494382, |
| "grad_norm": 0.14720433950424194, |
| "learning_rate": 1.6419475655430712e-05, |
| "loss": 0.1982, |
| "mean_token_accuracy": 0.9413078784942627, |
| "num_tokens": 26632.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9363295880149812, |
| "grad_norm": 0.11868773400783539, |
| "learning_rate": 1.626966292134832e-05, |
| "loss": 0.1955, |
| "mean_token_accuracy": 0.9468274474143982, |
| "num_tokens": 27742.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9737827715355806, |
| "grad_norm": 0.14357531070709229, |
| "learning_rate": 1.611985018726592e-05, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9457764148712158, |
| "num_tokens": 28851.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.0112359550561798, |
| "grad_norm": 0.21999526023864746, |
| "learning_rate": 1.5970037453183524e-05, |
| "loss": 0.1966, |
| "mean_token_accuracy": 0.9422410607337952, |
| "num_tokens": 29905.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.048689138576779, |
| "grad_norm": 0.10375912487506866, |
| "learning_rate": 1.5820224719101127e-05, |
| "loss": 0.1935, |
| "mean_token_accuracy": 0.9441024959087372, |
| "num_tokens": 31016.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0861423220973783, |
| "grad_norm": 0.2760375738143921, |
| "learning_rate": 1.5670411985018726e-05, |
| "loss": 0.1947, |
| "mean_token_accuracy": 0.9411046266555786, |
| "num_tokens": 32124.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.1235955056179776, |
| "grad_norm": 0.2127188742160797, |
| "learning_rate": 1.552059925093633e-05, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9514408648014069, |
| "num_tokens": 33234.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1235955056179776, |
| "eval_loss": 0.19752565026283264, |
| "eval_mean_token_accuracy": 0.9434169828891754, |
| "eval_num_tokens": 33234.0, |
| "eval_runtime": 2.4936, |
| "eval_samples_per_second": 11.63, |
| "eval_steps_per_second": 1.604, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1610486891385767, |
| "grad_norm": 0.10267303138971329, |
| "learning_rate": 1.537078651685393e-05, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.9476523637771607, |
| "num_tokens": 34342.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.198501872659176, |
| "grad_norm": 0.23754256963729858, |
| "learning_rate": 1.5220973782771537e-05, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9512970626354218, |
| "num_tokens": 35450.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2359550561797752, |
| "grad_norm": 0.09665194898843765, |
| "learning_rate": 1.507116104868914e-05, |
| "loss": 0.1911, |
| "mean_token_accuracy": 0.9494555711746215, |
| "num_tokens": 36558.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.2734082397003745, |
| "grad_norm": 0.11535191535949707, |
| "learning_rate": 1.4921348314606743e-05, |
| "loss": 0.1915, |
| "mean_token_accuracy": 0.9493873059749603, |
| "num_tokens": 37664.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.3108614232209739, |
| "grad_norm": 0.11016673594713211, |
| "learning_rate": 1.4771535580524345e-05, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9440759301185608, |
| "num_tokens": 38774.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.348314606741573, |
| "grad_norm": 0.24848656356334686, |
| "learning_rate": 1.4621722846441948e-05, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9458104014396668, |
| "num_tokens": 39883.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3857677902621723, |
| "grad_norm": 0.1400669664144516, |
| "learning_rate": 1.447191011235955e-05, |
| "loss": 0.1936, |
| "mean_token_accuracy": 0.9457758069038391, |
| "num_tokens": 40990.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.4232209737827715, |
| "grad_norm": 0.1753997802734375, |
| "learning_rate": 1.4322097378277155e-05, |
| "loss": 0.1921, |
| "mean_token_accuracy": 0.9477294445037842, |
| "num_tokens": 42099.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4606741573033708, |
| "grad_norm": 0.11102133989334106, |
| "learning_rate": 1.4172284644194758e-05, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.9459109544754029, |
| "num_tokens": 43209.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.4981273408239701, |
| "grad_norm": 0.12153730541467667, |
| "learning_rate": 1.402247191011236e-05, |
| "loss": 0.1908, |
| "mean_token_accuracy": 0.9495814442634583, |
| "num_tokens": 44320.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4981273408239701, |
| "eval_loss": 0.19722126424312592, |
| "eval_mean_token_accuracy": 0.9426012635231018, |
| "eval_num_tokens": 44320.0, |
| "eval_runtime": 2.4929, |
| "eval_samples_per_second": 11.633, |
| "eval_steps_per_second": 1.605, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.5355805243445693, |
| "grad_norm": 0.13351161777973175, |
| "learning_rate": 1.3872659176029963e-05, |
| "loss": 0.1906, |
| "mean_token_accuracy": 0.9469557940959931, |
| "num_tokens": 45434.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.5730337078651684, |
| "grad_norm": 0.1454717516899109, |
| "learning_rate": 1.3722846441947566e-05, |
| "loss": 0.1906, |
| "mean_token_accuracy": 0.9468878388404847, |
| "num_tokens": 46547.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.6104868913857677, |
| "grad_norm": 0.21453846991062164, |
| "learning_rate": 1.3573033707865169e-05, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.9432088494300842, |
| "num_tokens": 47659.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.647940074906367, |
| "grad_norm": 0.1796715408563614, |
| "learning_rate": 1.3423220973782773e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9468723952770233, |
| "num_tokens": 48771.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.6853932584269664, |
| "grad_norm": 0.18729475140571594, |
| "learning_rate": 1.3273408239700376e-05, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.9448257863521576, |
| "num_tokens": 49878.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.7228464419475655, |
| "grad_norm": 0.20833182334899902, |
| "learning_rate": 1.3123595505617978e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9460108697414398, |
| "num_tokens": 50990.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.7602996254681647, |
| "grad_norm": 0.09931682050228119, |
| "learning_rate": 1.2973782771535581e-05, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9476029396057128, |
| "num_tokens": 52099.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.797752808988764, |
| "grad_norm": 0.2103966772556305, |
| "learning_rate": 1.2823970037453184e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9421666264533997, |
| "num_tokens": 53208.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.8352059925093633, |
| "grad_norm": 0.07852394878864288, |
| "learning_rate": 1.2674157303370786e-05, |
| "loss": 0.1915, |
| "mean_token_accuracy": 0.9441100597381592, |
| "num_tokens": 54319.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8726591760299627, |
| "grad_norm": 0.09249723702669144, |
| "learning_rate": 1.2524344569288391e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9484964370727539, |
| "num_tokens": 55426.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.8726591760299627, |
| "eval_loss": 0.19536998867988586, |
| "eval_mean_token_accuracy": 0.945041760802269, |
| "eval_num_tokens": 55426.0, |
| "eval_runtime": 2.499, |
| "eval_samples_per_second": 11.605, |
| "eval_steps_per_second": 1.601, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.9101123595505618, |
| "grad_norm": 0.07890783250331879, |
| "learning_rate": 1.2374531835205994e-05, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9412918269634247, |
| "num_tokens": 56536.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.947565543071161, |
| "grad_norm": 0.2816140353679657, |
| "learning_rate": 1.2224719101123596e-05, |
| "loss": 0.1923, |
| "mean_token_accuracy": 0.9376968383789063, |
| "num_tokens": 57648.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9850187265917603, |
| "grad_norm": 0.08590656518936157, |
| "learning_rate": 1.2074906367041199e-05, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.9467627465724945, |
| "num_tokens": 58758.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.0224719101123596, |
| "grad_norm": 0.1013297438621521, |
| "learning_rate": 1.1925093632958802e-05, |
| "loss": 0.1903, |
| "mean_token_accuracy": 0.9485378265380859, |
| "num_tokens": 59811.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.059925093632959, |
| "grad_norm": 0.07267877459526062, |
| "learning_rate": 1.1775280898876404e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9469048321247101, |
| "num_tokens": 60923.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.097378277153558, |
| "grad_norm": 0.08559578657150269, |
| "learning_rate": 1.1625468164794009e-05, |
| "loss": 0.1913, |
| "mean_token_accuracy": 0.943006819486618, |
| "num_tokens": 62031.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.134831460674157, |
| "grad_norm": 0.2162655144929886, |
| "learning_rate": 1.1475655430711611e-05, |
| "loss": 0.188, |
| "mean_token_accuracy": 0.9467701494693757, |
| "num_tokens": 63140.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.1722846441947565, |
| "grad_norm": 0.08606795221567154, |
| "learning_rate": 1.1325842696629214e-05, |
| "loss": 0.189, |
| "mean_token_accuracy": 0.9439931452274323, |
| "num_tokens": 64249.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.209737827715356, |
| "grad_norm": 0.2562474310398102, |
| "learning_rate": 1.1176029962546817e-05, |
| "loss": 0.1926, |
| "mean_token_accuracy": 0.9457504689693451, |
| "num_tokens": 65356.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.247191011235955, |
| "grad_norm": 0.0770883709192276, |
| "learning_rate": 1.102621722846442e-05, |
| "loss": 0.1895, |
| "mean_token_accuracy": 0.9449774503707886, |
| "num_tokens": 66466.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.247191011235955, |
| "eval_loss": 0.19561618566513062, |
| "eval_mean_token_accuracy": 0.9387146234512329, |
| "eval_num_tokens": 66466.0, |
| "eval_runtime": 2.498, |
| "eval_samples_per_second": 11.609, |
| "eval_steps_per_second": 1.601, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1335, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3049029865728000.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|