{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.684931506849315,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00684931506849315,
      "grad_norm": 166.6063995361328,
      "learning_rate": 5e-06,
      "loss": 3.913,
      "step": 1
    },
    {
      "epoch": 0.0136986301369863,
      "grad_norm": 88.24701690673828,
      "learning_rate": 1e-05,
      "loss": 3.7453,
      "step": 2
    },
    {
      "epoch": 0.02054794520547945,
      "grad_norm": 78.30535125732422,
      "learning_rate": 9.89795918367347e-06,
      "loss": 2.1474,
      "step": 3
    },
    {
      "epoch": 0.0273972602739726,
      "grad_norm": 83.51199340820312,
      "learning_rate": 9.795918367346939e-06,
      "loss": 2.2588,
      "step": 4
    },
    {
      "epoch": 0.03424657534246575,
      "grad_norm": 49.122493743896484,
      "learning_rate": 9.693877551020408e-06,
      "loss": 1.8187,
      "step": 5
    },
    {
      "epoch": 0.0410958904109589,
      "grad_norm": 31.191837310791016,
      "learning_rate": 9.591836734693878e-06,
      "loss": 1.8402,
      "step": 6
    },
    {
      "epoch": 0.04794520547945205,
      "grad_norm": 43.579193115234375,
      "learning_rate": 9.489795918367348e-06,
      "loss": 1.8214,
      "step": 7
    },
    {
      "epoch": 0.0547945205479452,
      "grad_norm": 62.58639144897461,
      "learning_rate": 9.387755102040818e-06,
      "loss": 2.3602,
      "step": 8
    },
    {
      "epoch": 0.06164383561643835,
      "grad_norm": 40.35031509399414,
      "learning_rate": 9.285714285714288e-06,
      "loss": 1.8161,
      "step": 9
    },
    {
      "epoch": 0.0684931506849315,
      "grad_norm": 39.441307067871094,
      "learning_rate": 9.183673469387756e-06,
      "loss": 1.8453,
      "step": 10
    },
    {
      "epoch": 0.07534246575342465,
      "grad_norm": 36.04607391357422,
      "learning_rate": 9.081632653061225e-06,
      "loss": 1.4184,
      "step": 11
    },
    {
      "epoch": 0.0821917808219178,
      "grad_norm": 37.545467376708984,
      "learning_rate": 8.979591836734695e-06,
      "loss": 1.4609,
      "step": 12
    },
    {
      "epoch": 0.08904109589041095,
      "grad_norm": 33.20708465576172,
      "learning_rate": 8.877551020408163e-06,
      "loss": 1.8817,
      "step": 13
    },
    {
      "epoch": 0.0958904109589041,
      "grad_norm": 40.35082244873047,
      "learning_rate": 8.775510204081633e-06,
      "loss": 1.6072,
      "step": 14
    },
    {
      "epoch": 0.10273972602739725,
      "grad_norm": 52.9511604309082,
      "learning_rate": 8.673469387755103e-06,
      "loss": 1.4684,
      "step": 15
    },
    {
      "epoch": 0.1095890410958904,
      "grad_norm": 41.79618453979492,
      "learning_rate": 8.571428571428571e-06,
      "loss": 1.7434,
      "step": 16
    },
    {
      "epoch": 0.11643835616438356,
      "grad_norm": 39.11565017700195,
      "learning_rate": 8.469387755102042e-06,
      "loss": 1.7047,
      "step": 17
    },
    {
      "epoch": 0.1232876712328767,
      "grad_norm": 45.29304122924805,
      "learning_rate": 8.36734693877551e-06,
      "loss": 1.9334,
      "step": 18
    },
    {
      "epoch": 0.13013698630136986,
      "grad_norm": 24.95364761352539,
      "learning_rate": 8.26530612244898e-06,
      "loss": 2.2292,
      "step": 19
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 35.88187789916992,
      "learning_rate": 8.16326530612245e-06,
      "loss": 2.1195,
      "step": 20
    },
    {
      "epoch": 0.14383561643835616,
      "grad_norm": 30.69417381286621,
      "learning_rate": 8.06122448979592e-06,
      "loss": 1.4519,
      "step": 21
    },
    {
      "epoch": 0.1506849315068493,
      "grad_norm": 33.624210357666016,
      "learning_rate": 7.959183673469388e-06,
      "loss": 1.1561,
      "step": 22
    },
    {
      "epoch": 0.15753424657534246,
      "grad_norm": 29.83182144165039,
      "learning_rate": 7.857142857142858e-06,
      "loss": 1.9838,
      "step": 23
    },
    {
      "epoch": 0.1643835616438356,
      "grad_norm": 30.646284103393555,
      "learning_rate": 7.755102040816327e-06,
      "loss": 1.9414,
      "step": 24
    },
    {
      "epoch": 0.17123287671232876,
      "grad_norm": 32.19529724121094,
      "learning_rate": 7.653061224489796e-06,
      "loss": 1.4498,
      "step": 25
    },
    {
      "epoch": 0.1780821917808219,
      "grad_norm": 41.54957580566406,
      "learning_rate": 7.551020408163265e-06,
      "loss": 2.3265,
      "step": 26
    },
    {
      "epoch": 0.18493150684931506,
      "grad_norm": 62.288414001464844,
      "learning_rate": 7.448979591836736e-06,
      "loss": 3.0618,
      "step": 27
    },
    {
      "epoch": 0.1917808219178082,
      "grad_norm": 32.133243560791016,
      "learning_rate": 7.346938775510205e-06,
      "loss": 1.7778,
      "step": 28
    },
    {
      "epoch": 0.19863013698630136,
      "grad_norm": 37.86830520629883,
      "learning_rate": 7.244897959183675e-06,
      "loss": 1.9299,
      "step": 29
    },
    {
      "epoch": 0.2054794520547945,
      "grad_norm": 27.436555862426758,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 1.485,
      "step": 30
    },
    {
      "epoch": 0.21232876712328766,
      "grad_norm": 27.29859161376953,
      "learning_rate": 7.0408163265306125e-06,
      "loss": 1.7159,
      "step": 31
    },
    {
      "epoch": 0.2191780821917808,
      "grad_norm": 33.742835998535156,
      "learning_rate": 6.938775510204082e-06,
      "loss": 1.7625,
      "step": 32
    },
    {
      "epoch": 0.22602739726027396,
      "grad_norm": 38.45572280883789,
      "learning_rate": 6.836734693877551e-06,
      "loss": 1.8054,
      "step": 33
    },
    {
      "epoch": 0.2328767123287671,
      "grad_norm": 29.832292556762695,
      "learning_rate": 6.734693877551021e-06,
      "loss": 1.6,
      "step": 34
    },
    {
      "epoch": 0.23972602739726026,
      "grad_norm": 32.1478157043457,
      "learning_rate": 6.63265306122449e-06,
      "loss": 1.9878,
      "step": 35
    },
    {
      "epoch": 0.2465753424657534,
      "grad_norm": 21.848527908325195,
      "learning_rate": 6.530612244897959e-06,
      "loss": 1.9843,
      "step": 36
    },
    {
      "epoch": 0.2534246575342466,
      "grad_norm": 35.852169036865234,
      "learning_rate": 6.4285714285714295e-06,
      "loss": 1.7578,
      "step": 37
    },
    {
      "epoch": 0.2602739726027397,
      "grad_norm": 27.202524185180664,
      "learning_rate": 6.326530612244899e-06,
      "loss": 1.6341,
      "step": 38
    },
    {
      "epoch": 0.2671232876712329,
      "grad_norm": 28.326839447021484,
      "learning_rate": 6.224489795918368e-06,
      "loss": 2.0204,
      "step": 39
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 20.435285568237305,
      "learning_rate": 6.122448979591837e-06,
      "loss": 1.6218,
      "step": 40
    },
    {
      "epoch": 0.2808219178082192,
      "grad_norm": 81.60685729980469,
      "learning_rate": 6.020408163265307e-06,
      "loss": 1.6894,
      "step": 41
    },
    {
      "epoch": 0.2876712328767123,
      "grad_norm": 31.553621292114258,
      "learning_rate": 5.918367346938776e-06,
      "loss": 1.7057,
      "step": 42
    },
    {
      "epoch": 0.2945205479452055,
      "grad_norm": 28.031139373779297,
      "learning_rate": 5.816326530612246e-06,
      "loss": 1.5383,
      "step": 43
    },
    {
      "epoch": 0.3013698630136986,
      "grad_norm": 23.66860008239746,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 1.5447,
      "step": 44
    },
    {
      "epoch": 0.3082191780821918,
      "grad_norm": 34.923824310302734,
      "learning_rate": 5.6122448979591834e-06,
      "loss": 1.8337,
      "step": 45
    },
    {
      "epoch": 0.3150684931506849,
      "grad_norm": 20.53199577331543,
      "learning_rate": 5.510204081632653e-06,
      "loss": 2.0762,
      "step": 46
    },
    {
      "epoch": 0.3219178082191781,
      "grad_norm": 22.510568618774414,
      "learning_rate": 5.408163265306123e-06,
      "loss": 1.7565,
      "step": 47
    },
    {
      "epoch": 0.3287671232876712,
      "grad_norm": 24.311054229736328,
      "learning_rate": 5.306122448979593e-06,
      "loss": 1.8084,
      "step": 48
    },
    {
      "epoch": 0.3356164383561644,
      "grad_norm": 28.965652465820312,
      "learning_rate": 5.204081632653062e-06,
      "loss": 1.6366,
      "step": 49
    },
    {
      "epoch": 0.3424657534246575,
      "grad_norm": 14.990488052368164,
      "learning_rate": 5.1020408163265315e-06,
      "loss": 0.8044,
      "step": 50
    },
    {
      "epoch": 0.3493150684931507,
      "grad_norm": 30.92911148071289,
      "learning_rate": 5e-06,
      "loss": 0.9733,
      "step": 51
    },
    {
      "epoch": 0.3561643835616438,
      "grad_norm": 25.58120346069336,
      "learning_rate": 4.897959183673469e-06,
      "loss": 1.8185,
      "step": 52
    },
    {
      "epoch": 0.363013698630137,
      "grad_norm": 20.015125274658203,
      "learning_rate": 4.795918367346939e-06,
      "loss": 1.8311,
      "step": 53
    },
    {
      "epoch": 0.3698630136986301,
      "grad_norm": 31.811553955078125,
      "learning_rate": 4.693877551020409e-06,
      "loss": 2.0963,
      "step": 54
    },
    {
      "epoch": 0.3767123287671233,
      "grad_norm": 20.206634521484375,
      "learning_rate": 4.591836734693878e-06,
      "loss": 0.6103,
      "step": 55
    },
    {
      "epoch": 0.3835616438356164,
      "grad_norm": 19.538440704345703,
      "learning_rate": 4.489795918367348e-06,
      "loss": 2.4183,
      "step": 56
    },
    {
      "epoch": 0.3904109589041096,
      "grad_norm": 77.06710052490234,
      "learning_rate": 4.3877551020408165e-06,
      "loss": 2.3644,
      "step": 57
    },
    {
      "epoch": 0.3972602739726027,
      "grad_norm": 31.162639617919922,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 2.1601,
      "step": 58
    },
    {
      "epoch": 0.4041095890410959,
      "grad_norm": 22.61947250366211,
      "learning_rate": 4.183673469387755e-06,
      "loss": 1.1591,
      "step": 59
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 37.861270904541016,
      "learning_rate": 4.081632653061225e-06,
      "loss": 2.2028,
      "step": 60
    },
    {
      "epoch": 0.4178082191780822,
      "grad_norm": 17.079059600830078,
      "learning_rate": 3.979591836734694e-06,
      "loss": 1.7653,
      "step": 61
    },
    {
      "epoch": 0.4246575342465753,
      "grad_norm": 28.447805404663086,
      "learning_rate": 3.877551020408164e-06,
      "loss": 1.8112,
      "step": 62
    },
    {
      "epoch": 0.4315068493150685,
      "grad_norm": 33.241146087646484,
      "learning_rate": 3.7755102040816327e-06,
      "loss": 1.6641,
      "step": 63
    },
    {
      "epoch": 0.4383561643835616,
      "grad_norm": 30.07863998413086,
      "learning_rate": 3.6734693877551024e-06,
      "loss": 2.1975,
      "step": 64
    },
    {
      "epoch": 0.4452054794520548,
      "grad_norm": 37.52287292480469,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 1.9424,
      "step": 65
    },
    {
      "epoch": 0.4520547945205479,
      "grad_norm": 28.23394203186035,
      "learning_rate": 3.469387755102041e-06,
      "loss": 1.8685,
      "step": 66
    },
    {
      "epoch": 0.4589041095890411,
      "grad_norm": 28.84389305114746,
      "learning_rate": 3.3673469387755105e-06,
      "loss": 1.1797,
      "step": 67
    },
    {
      "epoch": 0.4657534246575342,
      "grad_norm": 29.903711318969727,
      "learning_rate": 3.2653061224489794e-06,
      "loss": 2.0214,
      "step": 68
    },
    {
      "epoch": 0.4726027397260274,
      "grad_norm": 36.350582122802734,
      "learning_rate": 3.1632653061224496e-06,
      "loss": 2.2569,
      "step": 69
    },
    {
      "epoch": 0.4794520547945205,
      "grad_norm": 31.127033233642578,
      "learning_rate": 3.0612244897959185e-06,
      "loss": 1.3549,
      "step": 70
    },
    {
      "epoch": 0.4863013698630137,
      "grad_norm": 47.7249755859375,
      "learning_rate": 2.959183673469388e-06,
      "loss": 2.2439,
      "step": 71
    },
    {
      "epoch": 0.4931506849315068,
      "grad_norm": 39.26190185546875,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 1.8277,
      "step": 72
    },
    {
      "epoch": 0.5,
      "grad_norm": 25.255807876586914,
      "learning_rate": 2.7551020408163266e-06,
      "loss": 1.8571,
      "step": 73
    },
    {
      "epoch": 0.5068493150684932,
      "grad_norm": 25.397775650024414,
      "learning_rate": 2.6530612244897964e-06,
      "loss": 1.5798,
      "step": 74
    },
    {
      "epoch": 0.5136986301369864,
      "grad_norm": 18.698389053344727,
      "learning_rate": 2.5510204081632657e-06,
      "loss": 1.8486,
      "step": 75
    },
    {
      "epoch": 0.5205479452054794,
      "grad_norm": 17.65583610534668,
      "learning_rate": 2.4489795918367347e-06,
      "loss": 0.3439,
      "step": 76
    },
    {
      "epoch": 0.5273972602739726,
      "grad_norm": 29.23255729675293,
      "learning_rate": 2.3469387755102044e-06,
      "loss": 1.4103,
      "step": 77
    },
    {
      "epoch": 0.5342465753424658,
      "grad_norm": 20.359149932861328,
      "learning_rate": 2.244897959183674e-06,
      "loss": 1.8338,
      "step": 78
    },
    {
      "epoch": 0.541095890410959,
      "grad_norm": 30.629518508911133,
      "learning_rate": 2.1428571428571427e-06,
      "loss": 1.9468,
      "step": 79
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 18.888212203979492,
      "learning_rate": 2.0408163265306125e-06,
      "loss": 1.6648,
      "step": 80
    },
    {
      "epoch": 0.5547945205479452,
      "grad_norm": 32.43148422241211,
      "learning_rate": 1.938775510204082e-06,
      "loss": 1.2884,
      "step": 81
    },
    {
      "epoch": 0.5616438356164384,
      "grad_norm": 23.12215805053711,
      "learning_rate": 1.8367346938775512e-06,
      "loss": 1.6872,
      "step": 82
    },
    {
      "epoch": 0.5684931506849316,
      "grad_norm": 17.43967056274414,
      "learning_rate": 1.7346938775510206e-06,
      "loss": 1.7704,
      "step": 83
    },
    {
      "epoch": 0.5753424657534246,
      "grad_norm": 23.49708366394043,
      "learning_rate": 1.6326530612244897e-06,
      "loss": 1.4206,
      "step": 84
    },
    {
      "epoch": 0.5821917808219178,
      "grad_norm": 23.952125549316406,
      "learning_rate": 1.5306122448979593e-06,
      "loss": 1.2965,
      "step": 85
    },
    {
      "epoch": 0.589041095890411,
      "grad_norm": 26.057159423828125,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 1.5459,
      "step": 86
    },
    {
      "epoch": 0.5958904109589042,
      "grad_norm": 28.730058670043945,
      "learning_rate": 1.3265306122448982e-06,
      "loss": 1.2233,
      "step": 87
    },
    {
      "epoch": 0.6027397260273972,
      "grad_norm": 25.128461837768555,
      "learning_rate": 1.2244897959183673e-06,
      "loss": 1.677,
      "step": 88
    },
    {
      "epoch": 0.6095890410958904,
      "grad_norm": 21.31721305847168,
      "learning_rate": 1.122448979591837e-06,
      "loss": 1.7159,
      "step": 89
    },
    {
      "epoch": 0.6164383561643836,
      "grad_norm": 24.109394073486328,
      "learning_rate": 1.0204081632653063e-06,
      "loss": 1.3831,
      "step": 90
    },
    {
      "epoch": 0.6232876712328768,
      "grad_norm": 32.16295623779297,
      "learning_rate": 9.183673469387756e-07,
      "loss": 1.4889,
      "step": 91
    },
    {
      "epoch": 0.6301369863013698,
      "grad_norm": 23.5495548248291,
      "learning_rate": 8.163265306122449e-07,
      "loss": 1.6685,
      "step": 92
    },
    {
      "epoch": 0.636986301369863,
      "grad_norm": 31.10687828063965,
      "learning_rate": 7.142857142857143e-07,
      "loss": 1.1017,
      "step": 93
    },
    {
      "epoch": 0.6438356164383562,
      "grad_norm": 27.091115951538086,
      "learning_rate": 6.122448979591837e-07,
      "loss": 1.9554,
      "step": 94
    },
    {
      "epoch": 0.6506849315068494,
      "grad_norm": 29.03687286376953,
      "learning_rate": 5.102040816326531e-07,
      "loss": 1.4978,
      "step": 95
    },
    {
      "epoch": 0.6575342465753424,
      "grad_norm": 21.225196838378906,
      "learning_rate": 4.0816326530612243e-07,
      "loss": 1.8585,
      "step": 96
    },
    {
      "epoch": 0.6643835616438356,
      "grad_norm": 27.35712242126465,
      "learning_rate": 3.0612244897959183e-07,
      "loss": 1.3636,
      "step": 97
    },
    {
      "epoch": 0.6712328767123288,
      "grad_norm": 26.40558624267578,
      "learning_rate": 2.0408163265306121e-07,
      "loss": 1.5249,
      "step": 98
    },
    {
      "epoch": 0.678082191780822,
      "grad_norm": 29.14642906188965,
      "learning_rate": 1.0204081632653061e-07,
      "loss": 2.0083,
      "step": 99
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 18.334836959838867,
      "learning_rate": 0.0,
      "loss": 1.4028,
      "step": 100
    },
    {
      "epoch": 0.684931506849315,
      "step": 100,
      "total_flos": 2305515375820800.0,
      "train_loss": 1.771125696003437,
      "train_runtime": 5623.6023,
      "train_samples_per_second": 0.018,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2305515375820800.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}