| --- RESUMING FROM: /home/skiredj.abderrahman/khalil/sbert_training/output/arabert_20260224_1730/checkpoint-18000 --- |
| {'loss': '0.8727', 'grad_norm': '5.438', 'learning_rate': '1.568e-05', 'epoch': '0.5891'} |
| {'loss': '0.8524', 'grad_norm': '5.458', 'learning_rate': '1.56e-05', 'epoch': '0.5956'} |
| {'loss': '0.8995', 'grad_norm': '6.666', 'learning_rate': '1.553e-05', 'epoch': '0.6021'} |
| {'loss': '0.836', 'grad_norm': '5.681', 'learning_rate': '1.546e-05', 'epoch': '0.6086'} |
| {'loss': '0.8628', 'grad_norm': '6.571', 'learning_rate': '1.539e-05', 'epoch': '0.615'} |
| {'loss': '0.8244', 'grad_norm': '6.389', 'learning_rate': '1.532e-05', 'epoch': '0.6215'} |
| {'loss': '0.8647', 'grad_norm': '4.987', 'learning_rate': '1.525e-05', 'epoch': '0.628'} |
| {'loss': '0.8479', 'grad_norm': '4.451', 'learning_rate': '1.517e-05', 'epoch': '0.6345'} |
| {'loss': '0.8204', 'grad_norm': '5.356', 'learning_rate': '1.51e-05', 'epoch': '0.6409'} |
| {'loss': '0.8359', 'grad_norm': '5.146', 'learning_rate': '1.503e-05', 'epoch': '0.6474'} |
| {'loss': '0.7952', 'grad_norm': '4.308', 'learning_rate': '1.496e-05', 'epoch': '0.6539'} |
| {'loss': '0.8375', 'grad_norm': '5.216', 'learning_rate': '1.489e-05', 'epoch': '0.6604'} |
| {'loss': '0.8364', 'grad_norm': '5.812', 'learning_rate': '1.481e-05', 'epoch': '0.6668'} |
| {'loss': '0.8131', 'grad_norm': '5.52', 'learning_rate': '1.474e-05', 'epoch': '0.6733'} |
| {'loss': '0.831', 'grad_norm': '6.452', 'learning_rate': '1.467e-05', 'epoch': '0.6798'} |
| {'loss': '0.8295', 'grad_norm': '4.274', 'learning_rate': '1.46e-05', 'epoch': '0.6863'} |
| {'loss': '0.7865', 'grad_norm': '4.77', 'learning_rate': '1.453e-05', 'epoch': '0.6927'} |
| {'loss': '0.796', 'grad_norm': '5.027', 'learning_rate': '1.445e-05', 'epoch': '0.6992'} |
| {'loss': '0.8287', 'grad_norm': '4.826', 'learning_rate': '1.438e-05', 'epoch': '0.7057'} |
| {'loss': '0.8214', 'grad_norm': '4.381', 'learning_rate': '1.431e-05', 'epoch': '0.7121'} |
| {'loss': '0.7879', 'grad_norm': '6.475', 'learning_rate': '1.424e-05', 'epoch': '0.7186'} |
| {'loss': '0.8139', 'grad_norm': '5.295', 'learning_rate': '1.417e-05', 'epoch': '0.7251'} |
| {'loss': '0.7849', 'grad_norm': '5.051', 'learning_rate': '1.409e-05', 'epoch': '0.7316'} |
| {'loss': '0.788', 'grad_norm': '5.113', 'learning_rate': '1.402e-05', 'epoch': '0.738'} |
| {'loss': '0.7725', 'grad_norm': '4.049', 'learning_rate': '1.395e-05', 'epoch': '0.7445'} |
| {'loss': '0.8086', 'grad_norm': '4.646', 'learning_rate': '1.388e-05', 'epoch': '0.751'} |
| {'loss': '0.7687', 'grad_norm': '5.049', 'learning_rate': '1.381e-05', 'epoch': '0.7575'} |
| {'loss': '0.7828', 'grad_norm': '6.568', 'learning_rate': '1.373e-05', 'epoch': '0.7639'} |
| {'loss': '0.7518', 'grad_norm': '5.9', 'learning_rate': '1.366e-05', 'epoch': '0.7704'} |
| {'loss': '0.7599', 'grad_norm': '6.338', 'learning_rate': '1.359e-05', 'epoch': '0.7769'} |
| {'eval_train_loss': '0.4041', 'eval_dev-768_cosine_accuracy': '0.9737', 'eval_dev-512_cosine_accuracy': '0.9738', 'eval_dev-256_cosine_accuracy': '0.9738', 'eval_dev-128_cosine_accuracy': '0.9734', 'eval_dev-64_cosine_accuracy': '0.9718', 'eval_sequential_score': '0.9737', 'eval_train_runtime': '9673', 'eval_train_samples_per_second': '116.8', 'eval_train_steps_per_second': '14.6', 'epoch': '0.7769'} |
| {'loss': '0.7332', 'grad_norm': '4.95', 'learning_rate': '1.352e-05', 'epoch': '0.7834'} |
| {'loss': '0.7476', 'grad_norm': '4.513', 'learning_rate': '1.345e-05', 'epoch': '0.7898'} |
| {'loss': '0.7806', 'grad_norm': '5.095', 'learning_rate': '1.337e-05', 'epoch': '0.7963'} |
| {'loss': '0.7511', 'grad_norm': '5.826', 'learning_rate': '1.33e-05', 'epoch': '0.8028'} |
| {'loss': '0.7652', 'grad_norm': '6.09', 'learning_rate': '1.323e-05', 'epoch': '0.8093'} |
| {'loss': '0.7883', 'grad_norm': '4.332', 'learning_rate': '1.316e-05', 'epoch': '0.8157'} |
| {'loss': '0.7305', 'grad_norm': '5.749', 'learning_rate': '1.309e-05', 'epoch': '0.8222'} |
| {'loss': '0.7308', 'grad_norm': '4.871', 'learning_rate': '1.302e-05', 'epoch': '0.8287'} |
| {'loss': '0.7368', 'grad_norm': '4.618', 'learning_rate': '1.294e-05', 'epoch': '0.8352'} |
| {'loss': '0.7432', 'grad_norm': '4.836', 'learning_rate': '1.287e-05', 'epoch': '0.8416'} |
| {'loss': '0.7046', 'grad_norm': '4.988', 'learning_rate': '1.28e-05', 'epoch': '0.8481'} |
| {'loss': '0.7476', 'grad_norm': '4.596', 'learning_rate': '1.273e-05', 'epoch': '0.8546'} |
| {'loss': '0.7212', 'grad_norm': '5.712', 'learning_rate': '1.266e-05', 'epoch': '0.8611'} |
| {'loss': '0.7335', 'grad_norm': '3.99', 'learning_rate': '1.258e-05', 'epoch': '0.8675'} |
| {'loss': '0.7415', 'grad_norm': '5.446', 'learning_rate': '1.251e-05', 'epoch': '0.874'} |
| {'loss': '0.6937', 'grad_norm': '5.257', 'learning_rate': '1.244e-05', 'epoch': '0.8805'} |
| {'loss': '0.7294', 'grad_norm': '5.302', 'learning_rate': '1.237e-05', 'epoch': '0.8869'} |
| {'loss': '0.7436', 'grad_norm': '3.847', 'learning_rate': '1.23e-05', 'epoch': '0.8934'} |
| {'loss': '0.7093', 'grad_norm': '6.182', 'learning_rate': '1.222e-05', 'epoch': '0.8999'} |
| {'loss': '0.748', 'grad_norm': '5.445', 'learning_rate': '1.215e-05', 'epoch': '0.9064'} |
| {'loss': '0.7039', 'grad_norm': '5.002', 'learning_rate': '1.208e-05', 'epoch': '0.9128'} |
| {'loss': '0.7091', 'grad_norm': '5.085', 'learning_rate': '1.201e-05', 'epoch': '0.9193'} |
| {'loss': '0.7019', 'grad_norm': '5.379', 'learning_rate': '1.194e-05', 'epoch': '0.9258'} |
| {'loss': '0.7081', 'grad_norm': '5.63', 'learning_rate': '1.186e-05', 'epoch': '0.9323'} |
| {'loss': '0.6833', 'grad_norm': '2.541', 'learning_rate': '1.179e-05', 'epoch': '0.9387'} |
| {'loss': '0.6982', 'grad_norm': '5.714', 'learning_rate': '1.172e-05', 'epoch': '0.9452'} |
| {'loss': '0.7249', 'grad_norm': '5.051', 'learning_rate': '1.165e-05', 'epoch': '0.9517'} |
| {'loss': '0.7282', 'grad_norm': '6.322', 'learning_rate': '1.158e-05', 'epoch': '0.9582'} |
| {'loss': '0.7147', 'grad_norm': '4.961', 'learning_rate': '1.15e-05', 'epoch': '0.9646'} |
| {'loss': '0.6742', 'grad_norm': '4.871', 'learning_rate': '1.143e-05', 'epoch': '0.9711'} |
| {'eval_train_loss': '0.364', 'eval_dev-768_cosine_accuracy': '0.9758', 'eval_dev-512_cosine_accuracy': '0.9759', 'eval_dev-256_cosine_accuracy': '0.9761', 'eval_dev-128_cosine_accuracy': '0.9757', 'eval_dev-64_cosine_accuracy': '0.9742', 'eval_sequential_score': '0.9758', 'eval_train_runtime': '9649', 'eval_train_samples_per_second': '117.1', 'eval_train_steps_per_second': '14.64', 'epoch': '0.9711'} |
| {'loss': '0.6901', 'grad_norm': '3.348', 'learning_rate': '1.136e-05', 'epoch': '0.9776'} |
| {'loss': '0.7067', 'grad_norm': '3.76', 'learning_rate': '1.129e-05', 'epoch': '0.9841'} |
| {'loss': '0.7166', 'grad_norm': '4.729', 'learning_rate': '1.122e-05', 'epoch': '0.9905'} |
| {'loss': '0.68', 'grad_norm': '4.648', 'learning_rate': '1.114e-05', 'epoch': '0.997'} |
| {'loss': '0.6846', 'grad_norm': '4.427', 'learning_rate': '1.107e-05', 'epoch': '1.003'} |
| {'loss': '0.6723', 'grad_norm': '4.459', 'learning_rate': '1.1e-05', 'epoch': '1.01'} |
| {'loss': '0.6573', 'grad_norm': '6.387', 'learning_rate': '1.093e-05', 'epoch': '1.016'} |
| {'loss': '0.6895', 'grad_norm': '4.1', 'learning_rate': '1.086e-05', 'epoch': '1.023'} |
| {'loss': '0.6588', 'grad_norm': '5.927', 'learning_rate': '1.079e-05', 'epoch': '1.029'} |
| {'loss': '0.6517', 'grad_norm': '5.9', 'learning_rate': '1.071e-05', 'epoch': '1.036'} |
| {'loss': '0.6498', 'grad_norm': '4.736', 'learning_rate': '1.064e-05', 'epoch': '1.042'} |
| {'loss': '0.6836', 'grad_norm': '5.029', 'learning_rate': '1.057e-05', 'epoch': '1.049'} |
| {'loss': '0.6819', 'grad_norm': '2.595', 'learning_rate': '1.05e-05', 'epoch': '1.055'} |
| {'loss': '0.6463', 'grad_norm': '4.963', 'learning_rate': '1.043e-05', 'epoch': '1.062'} |
| {'loss': '0.6645', 'grad_norm': '5.046', 'learning_rate': '1.035e-05', 'epoch': '1.068'} |
| {'loss': '0.6518', 'grad_norm': '3.307', 'learning_rate': '1.028e-05', 'epoch': '1.075'} |
| {'loss': '0.6235', 'grad_norm': '3.848', 'learning_rate': '1.021e-05', 'epoch': '1.081'} |
| {'loss': '0.6302', 'grad_norm': '4.664', 'learning_rate': '1.014e-05', 'epoch': '1.088'} |
| {'loss': '0.6452', 'grad_norm': '5.47', 'learning_rate': '1.007e-05', 'epoch': '1.094'} |
| {'loss': '0.6477', 'grad_norm': '5.26', 'learning_rate': '9.994e-06', 'epoch': '1.101'} |
| {'loss': '0.6084', 'grad_norm': '4.313', 'learning_rate': '9.922e-06', 'epoch': '1.107'} |
| {'loss': '0.6259', 'grad_norm': '6.499', 'learning_rate': '9.85e-06', 'epoch': '1.114'} |
| {'loss': '0.607', 'grad_norm': '3.922', 'learning_rate': '9.778e-06', 'epoch': '1.12'} |
| {'loss': '0.5977', 'grad_norm': '5.37', 'learning_rate': '9.706e-06', 'epoch': '1.126'} |
| {'loss': '0.6044', 'grad_norm': '5.068', 'learning_rate': '9.634e-06', 'epoch': '1.133'} |
| {'loss': '0.6007', 'grad_norm': '4.109', 'learning_rate': '9.562e-06', 'epoch': '1.139'} |
| {'loss': '0.5628', 'grad_norm': '4.954', 'learning_rate': '9.491e-06', 'epoch': '1.146'} |
| {'loss': '0.5732', 'grad_norm': '4.068', 'learning_rate': '9.419e-06', 'epoch': '1.152'} |
| {'loss': '0.5773', 'grad_norm': '4.939', 'learning_rate': '9.347e-06', 'epoch': '1.159'} |
| {'loss': '0.5719', 'grad_norm': '4.418', 'learning_rate': '9.275e-06', 'epoch': '1.165'} |
| {'eval_train_loss': '0.3356', 'eval_dev-768_cosine_accuracy': '0.9775', 'eval_dev-512_cosine_accuracy': '0.9777', 'eval_dev-256_cosine_accuracy': '0.9777', 'eval_dev-128_cosine_accuracy': '0.9774', 'eval_dev-64_cosine_accuracy': '0.976', 'eval_sequential_score': '0.9775', 'eval_train_runtime': '1.01e+04', 'eval_train_samples_per_second': '111.8', 'eval_train_steps_per_second': '13.98', 'epoch': '1.165'} |
| {'loss': '0.5471', 'grad_norm': '3.58', 'learning_rate': '9.203e-06', 'epoch': '1.172'} |
| {'loss': '0.5635', 'grad_norm': '5.198', 'learning_rate': '9.131e-06', 'epoch': '1.178'} |
| {'loss': '0.539', 'grad_norm': '4.468', 'learning_rate': '9.059e-06', 'epoch': '1.185'} |
| {'loss': '0.5428', 'grad_norm': '4.349', 'learning_rate': '8.987e-06', 'epoch': '1.191'} |
| {'loss': '0.5205', 'grad_norm': '2.936', 'learning_rate': '8.915e-06', 'epoch': '1.198'} |
| {'loss': '0.5362', 'grad_norm': '3.337', 'learning_rate': '8.843e-06', 'epoch': '1.204'} |
| {'loss': '0.5386', 'grad_norm': '5.76', 'learning_rate': '8.771e-06', 'epoch': '1.211'} |
| {'loss': '0.5203', 'grad_norm': '3.261', 'learning_rate': '8.699e-06', 'epoch': '1.217'} |
| {'loss': '0.5301', 'grad_norm': '3.732', 'learning_rate': '8.627e-06', 'epoch': '1.224'} |
| {'loss': '0.5232', 'grad_norm': '4.54', 'learning_rate': '8.555e-06', 'epoch': '1.23'} |
| {'loss': '0.4922', 'grad_norm': '4.291', 'learning_rate': '8.483e-06', 'epoch': '1.237'} |
| {'loss': '0.5029', 'grad_norm': '3.979', 'learning_rate': '8.412e-06', 'epoch': '1.243'} |
| {'loss': '0.4989', 'grad_norm': '7.829', 'learning_rate': '8.34e-06', 'epoch': '1.249'} |
| {'loss': '0.5053', 'grad_norm': '2.903', 'learning_rate': '8.268e-06', 'epoch': '1.256'} |
| {'loss': '0.5081', 'grad_norm': '5.471', 'learning_rate': '8.196e-06', 'epoch': '1.262'} |
| {'loss': '0.496', 'grad_norm': '5.204', 'learning_rate': '8.124e-06', 'epoch': '1.269'} |
| {'loss': '0.5052', 'grad_norm': '4.377', 'learning_rate': '8.052e-06', 'epoch': '1.275'} |
| {'loss': '0.4984', 'grad_norm': '4.184', 'learning_rate': '7.98e-06', 'epoch': '1.282'} |
| {'loss': '0.4909', 'grad_norm': '4.991', 'learning_rate': '7.908e-06', 'epoch': '1.288'} |
| {'loss': '0.512', 'grad_norm': '3.76', 'learning_rate': '7.836e-06', 'epoch': '1.295'} |
| {'loss': '0.4873', 'grad_norm': '3.844', 'learning_rate': '7.764e-06', 'epoch': '1.301'} |
| {'loss': '0.4896', 'grad_norm': '6.987', 'learning_rate': '7.692e-06', 'epoch': '1.308'} |
| {'loss': '0.49', 'grad_norm': '6.267', 'learning_rate': '7.62e-06', 'epoch': '1.314'} |
| {'loss': '0.5036', 'grad_norm': '3.776', 'learning_rate': '7.548e-06', 'epoch': '1.321'} |
| {'loss': '0.4876', 'grad_norm': '3.42', 'learning_rate': '7.476e-06', 'epoch': '1.327'} |
| {'loss': '0.4705', 'grad_norm': '5.478', 'learning_rate': '7.404e-06', 'epoch': '1.334'} |
| {'loss': '0.4786', 'grad_norm': '3.313', 'learning_rate': '7.333e-06', 'epoch': '1.34'} |
| {'loss': '0.4998', 'grad_norm': '3.13', 'learning_rate': '7.261e-06', 'epoch': '1.347'} |
| {'loss': '0.4692', 'grad_norm': '3.971', 'learning_rate': '7.189e-06', 'epoch': '1.353'} |
| {'loss': '0.5064', 'grad_norm': '6.238', 'learning_rate': '7.117e-06', 'epoch': '1.36'} |
| {'eval_train_loss': '0.316', 'eval_dev-768_cosine_accuracy': '0.9788', 'eval_dev-512_cosine_accuracy': '0.979', 'eval_dev-256_cosine_accuracy': '0.979', 'eval_dev-128_cosine_accuracy': '0.9785', 'eval_dev-64_cosine_accuracy': '0.9774', 'eval_sequential_score': '0.9788', 'eval_train_runtime': '1.014e+04', 'eval_train_samples_per_second': '111.5', 'eval_train_steps_per_second': '13.93', 'epoch': '1.36'} |
| {'loss': '0.4925', 'grad_norm': '5.158', 'learning_rate': '7.045e-06', 'epoch': '1.366'} |
| {'loss': '0.4601', 'grad_norm': '4.139', 'learning_rate': '6.973e-06', 'epoch': '1.372'} |
| {'loss': '0.4762', 'grad_norm': '3.411', 'learning_rate': '6.901e-06', 'epoch': '1.379'} |
| {'loss': '0.4986', 'grad_norm': '4.23', 'learning_rate': '6.829e-06', 'epoch': '1.385'} |
| {'loss': '0.4656', 'grad_norm': '5.326', 'learning_rate': '6.757e-06', 'epoch': '1.392'} |
| {'loss': '0.4507', 'grad_norm': '3.826', 'learning_rate': '6.685e-06', 'epoch': '1.398'} |
| {'loss': '0.4862', 'grad_norm': '3.509', 'learning_rate': '6.613e-06', 'epoch': '1.405'} |
| {'loss': '0.4596', 'grad_norm': '4.734', 'learning_rate': '6.541e-06', 'epoch': '1.411'} |
| {'loss': '0.4696', 'grad_norm': '4.799', 'learning_rate': '6.469e-06', 'epoch': '1.418'} |
| {'loss': '0.4925', 'grad_norm': '4.942', 'learning_rate': '6.397e-06', 'epoch': '1.424'} |
| {'loss': '0.4796', 'grad_norm': '4.147', 'learning_rate': '6.325e-06', 'epoch': '1.431'} |
| {'loss': '0.4525', 'grad_norm': '5.146', 'learning_rate': '6.254e-06', 'epoch': '1.437'} |
| {'loss': '0.4717', 'grad_norm': '3.52', 'learning_rate': '6.182e-06', 'epoch': '1.444'} |
| {'loss': '0.4803', 'grad_norm': '3.25', 'learning_rate': '6.11e-06', 'epoch': '1.45'} |
| {'loss': '0.4675', 'grad_norm': '7.35', 'learning_rate': '6.038e-06', 'epoch': '1.457'} |
| {'loss': '0.4631', 'grad_norm': '3.847', 'learning_rate': '5.966e-06', 'epoch': '1.463'} |
| {'loss': '0.4622', 'grad_norm': '4.57', 'learning_rate': '5.894e-06', 'epoch': '1.47'} |
| {'loss': '0.4496', 'grad_norm': '1.997', 'learning_rate': '5.822e-06', 'epoch': '1.476'} |
| {'loss': '0.4678', 'grad_norm': '4.266', 'learning_rate': '5.75e-06', 'epoch': '1.483'} |
| {'loss': '0.4495', 'grad_norm': '5.948', 'learning_rate': '5.678e-06', 'epoch': '1.489'} |
| {'loss': '0.4474', 'grad_norm': '3.7', 'learning_rate': '5.606e-06', 'epoch': '1.495'} |
| {'loss': '0.4587', 'grad_norm': '2.877', 'learning_rate': '5.534e-06', 'epoch': '1.502'} |
| {'loss': '0.4591', 'grad_norm': '4.245', 'learning_rate': '5.462e-06', 'epoch': '1.508'} |
| {'loss': '0.4573', 'grad_norm': '5.431', 'learning_rate': '5.39e-06', 'epoch': '1.515'} |
| {'loss': '0.4442', 'grad_norm': '3.338', 'learning_rate': '5.318e-06', 'epoch': '1.521'} |
| {'loss': '0.455', 'grad_norm': '4.723', 'learning_rate': '5.246e-06', 'epoch': '1.528'} |
| {'loss': '0.4493', 'grad_norm': '4.226', 'learning_rate': '5.175e-06', 'epoch': '1.534'} |
| {'loss': '0.4485', 'grad_norm': '4.451', 'learning_rate': '5.103e-06', 'epoch': '1.541'} |
| {'loss': '0.4569', 'grad_norm': '4.297', 'learning_rate': '5.031e-06', 'epoch': '1.547'} |
| {'loss': '0.4346', 'grad_norm': '4.199', 'learning_rate': '4.959e-06', 'epoch': '1.554'} |
| {'eval_train_loss': '0.3001', 'eval_dev-768_cosine_accuracy': '0.9799', 'eval_dev-512_cosine_accuracy': '0.9802', 'eval_dev-256_cosine_accuracy': '0.9802', 'eval_dev-128_cosine_accuracy': '0.9798', 'eval_dev-64_cosine_accuracy': '0.9788', 'eval_sequential_score': '0.9799', 'eval_train_runtime': '1.008e+04', 'eval_train_samples_per_second': '112.1', 'eval_train_steps_per_second': '14.02', 'epoch': '1.554'} |
| {'loss': '0.4469', 'grad_norm': '3.364', 'learning_rate': '4.887e-06', 'epoch': '1.56'} |
| {'loss': '0.4602', 'grad_norm': '5.309', 'learning_rate': '4.815e-06', 'epoch': '1.567'} |
| {'loss': '0.443', 'grad_norm': '3.875', 'learning_rate': '4.743e-06', 'epoch': '1.573'} |
| {'loss': '0.4524', 'grad_norm': '4.824', 'learning_rate': '4.671e-06', 'epoch': '1.58'} |
| {'loss': '0.4528', 'grad_norm': '4.996', 'learning_rate': '4.599e-06', 'epoch': '1.586'} |
| {'loss': '0.4348', 'grad_norm': '4.96', 'learning_rate': '4.527e-06', 'epoch': '1.593'} |
| {'loss': '0.4533', 'grad_norm': '5.219', 'learning_rate': '4.455e-06', 'epoch': '1.599'} |
| {'loss': '0.4523', 'grad_norm': '3.444', 'learning_rate': '4.383e-06', 'epoch': '1.606'} |
| {'loss': '0.4509', 'grad_norm': '5.647', 'learning_rate': '4.311e-06', 'epoch': '1.612'} |
| {'loss': '0.4365', 'grad_norm': '5.052', 'learning_rate': '4.239e-06', 'epoch': '1.618'} |
| {'loss': '0.4504', 'grad_norm': '5.786', 'learning_rate': '4.167e-06', 'epoch': '1.625'} |
| {'loss': '0.4292', 'grad_norm': '4.353', 'learning_rate': '4.096e-06', 'epoch': '1.631'} |
| {'loss': '0.4406', 'grad_norm': '2.976', 'learning_rate': '4.024e-06', 'epoch': '1.638'} |
| {'loss': '0.4333', 'grad_norm': '3.685', 'learning_rate': '3.952e-06', 'epoch': '1.644'} |
| {'loss': '0.4361', 'grad_norm': '4.107', 'learning_rate': '3.88e-06', 'epoch': '1.651'} |
| {'loss': '0.4065', 'grad_norm': '3.636', 'learning_rate': '3.808e-06', 'epoch': '1.657'} |
| {'loss': '0.4671', 'grad_norm': '3.464', 'learning_rate': '3.736e-06', 'epoch': '1.664'} |
| {'loss': '0.4328', 'grad_norm': '3.129', 'learning_rate': '3.664e-06', 'epoch': '1.67'} |
| {'loss': '0.431', 'grad_norm': '2.453', 'learning_rate': '3.592e-06', 'epoch': '1.677'} |
| {'loss': '0.4523', 'grad_norm': '3.727', 'learning_rate': '3.52e-06', 'epoch': '1.683'} |
| {'loss': '0.4232', 'grad_norm': '4.398', 'learning_rate': '3.448e-06', 'epoch': '1.69'} |
| {'loss': '0.4257', 'grad_norm': '2.861', 'learning_rate': '3.376e-06', 'epoch': '1.696'} |
| {'loss': '0.4448', 'grad_norm': '3.523', 'learning_rate': '3.304e-06', 'epoch': '1.703'} |
| {'loss': '0.4491', 'grad_norm': '3.893', 'learning_rate': '3.232e-06', 'epoch': '1.709'} |
| {'loss': '0.4224', 'grad_norm': '3.399', 'learning_rate': '3.16e-06', 'epoch': '1.716'} |
| {'loss': '0.4297', 'grad_norm': '4.703', 'learning_rate': '3.088e-06', 'epoch': '1.722'} |
| {'loss': '0.4522', 'grad_norm': '4.29', 'learning_rate': '3.017e-06', 'epoch': '1.729'} |
| {'loss': '0.4195', 'grad_norm': '4.29', 'learning_rate': '2.945e-06', 'epoch': '1.735'} |
| {'loss': '0.4227', 'grad_norm': '3.841', 'learning_rate': '2.873e-06', 'epoch': '1.742'} |
| {'loss': '0.4381', 'grad_norm': '4.086', 'learning_rate': '2.801e-06', 'epoch': '1.748'} |
| {'eval_train_loss': '0.2875', 'eval_dev-768_cosine_accuracy': '0.9807', 'eval_dev-512_cosine_accuracy': '0.9808', 'eval_dev-256_cosine_accuracy': '0.9808', 'eval_dev-128_cosine_accuracy': '0.9805', 'eval_dev-64_cosine_accuracy': '0.9794', 'eval_sequential_score': '0.9807', 'eval_train_runtime': '1.012e+04', 'eval_train_samples_per_second': '111.6', 'eval_train_steps_per_second': '13.95', 'epoch': '1.748'} |
| {'loss': '0.446', 'grad_norm': '4.176', 'learning_rate': '2.729e-06', 'epoch': '1.754'} |
| {'loss': '0.426', 'grad_norm': '4.261', 'learning_rate': '2.657e-06', 'epoch': '1.761'} |
| {'loss': '0.4299', 'grad_norm': '4.676', 'learning_rate': '2.585e-06', 'epoch': '1.767'} |
| {'loss': '0.4247', 'grad_norm': '3.933', 'learning_rate': '2.513e-06', 'epoch': '1.774'} |
| {'loss': '0.4244', 'grad_norm': '4.853', 'learning_rate': '2.441e-06', 'epoch': '1.78'} |
| {'loss': '0.4185', 'grad_norm': '2.985', 'learning_rate': '2.369e-06', 'epoch': '1.787'} |
| {'loss': '0.4292', 'grad_norm': '3.804', 'learning_rate': '2.297e-06', 'epoch': '1.793'} |
| {'loss': '0.4468', 'grad_norm': '3.187', 'learning_rate': '2.225e-06', 'epoch': '1.8'} |
| {'loss': '0.4118', 'grad_norm': '4.004', 'learning_rate': '2.153e-06', 'epoch': '1.806'} |
| {'loss': '0.4306', 'grad_norm': '4.007', 'learning_rate': '2.081e-06', 'epoch': '1.813'} |
| {'loss': '0.4447', 'grad_norm': '4.323', 'learning_rate': '2.009e-06', 'epoch': '1.819'} |
| {'loss': '0.4147', 'grad_norm': '3.863', 'learning_rate': '1.938e-06', 'epoch': '1.826'} |
| {'loss': '0.4189', 'grad_norm': '4.788', 'learning_rate': '1.866e-06', 'epoch': '1.832'} |
| {'loss': '0.4167', 'grad_norm': '4.276', 'learning_rate': '1.794e-06', 'epoch': '1.839'} |
| {'loss': '0.4022', 'grad_norm': '3.887', 'learning_rate': '1.722e-06', 'epoch': '1.845'} |
| {'loss': '0.4158', 'grad_norm': '3.075', 'learning_rate': '1.65e-06', 'epoch': '1.852'} |
| {'loss': '0.4228', 'grad_norm': '3.993', 'learning_rate': '1.578e-06', 'epoch': '1.858'} |
| {'loss': '0.4256', 'grad_norm': '4.497', 'learning_rate': '1.506e-06', 'epoch': '1.865'} |
| {'loss': '0.4251', 'grad_norm': '4.539', 'learning_rate': '1.434e-06', 'epoch': '1.871'} |
| {'loss': '0.4232', 'grad_norm': '2.337', 'learning_rate': '1.362e-06', 'epoch': '1.877'} |
| {'loss': '0.4143', 'grad_norm': '3.389', 'learning_rate': '1.29e-06', 'epoch': '1.884'} |
| {'loss': '0.4331', 'grad_norm': '3.545', 'learning_rate': '1.218e-06', 'epoch': '1.89'} |
| {'loss': '0.4253', 'grad_norm': '5.606', 'learning_rate': '1.146e-06', 'epoch': '1.897'} |
| {'loss': '0.441', 'grad_norm': '4.453', 'learning_rate': '1.074e-06', 'epoch': '1.903'} |
| {'loss': '0.4337', 'grad_norm': '5.374', 'learning_rate': '1.002e-06', 'epoch': '1.91'} |
| {'loss': '0.4016', 'grad_norm': '2.246', 'learning_rate': '9.305e-07', 'epoch': '1.916'} |
| {'loss': '0.4249', 'grad_norm': '5.255', 'learning_rate': '8.585e-07', 'epoch': '1.923'} |
| {'loss': '0.4108', 'grad_norm': '3.59', 'learning_rate': '7.866e-07', 'epoch': '1.929'} |
| {'loss': '0.4272', 'grad_norm': '4.258', 'learning_rate': '7.147e-07', 'epoch': '1.936'} |
| {'loss': '0.3916', 'grad_norm': '3.476', 'learning_rate': '6.427e-07', 'epoch': '1.942'} |
| {'eval_train_loss': '0.2812', 'eval_dev-768_cosine_accuracy': '0.981', 'eval_dev-512_cosine_accuracy': '0.9811', 'eval_dev-256_cosine_accuracy': '0.9813', 'eval_dev-128_cosine_accuracy': '0.9811', 'eval_dev-64_cosine_accuracy': '0.9797', 'eval_sequential_score': '0.981', 'eval_train_runtime': '1.03e+04', 'eval_train_samples_per_second': '109.7', 'eval_train_steps_per_second': '13.71', 'epoch': '1.942'} |
| {'loss': '0.4334', 'grad_norm': '4.623', 'learning_rate': '5.708e-07', 'epoch': '1.949'} |
| {'loss': '0.4462', 'grad_norm': '5.31', 'learning_rate': '4.989e-07', 'epoch': '1.955'} |
| {'loss': '0.4436', 'grad_norm': '3.379', 'learning_rate': '4.269e-07', 'epoch': '1.962'} |
| {'loss': '0.4278', 'grad_norm': '5.471', 'learning_rate': '3.55e-07', 'epoch': '1.968'} |
| {'loss': '0.417', 'grad_norm': '3.435', 'learning_rate': '2.831e-07', 'epoch': '1.975'} |
| {'loss': '0.4376', 'grad_norm': '2.617', 'learning_rate': '2.111e-07', 'epoch': '1.981'} |
| {'loss': '0.4433', 'grad_norm': '3.465', 'learning_rate': '1.392e-07', 'epoch': '1.988'} |
| {'loss': '0.4292', 'grad_norm': '2.354', 'learning_rate': '6.726e-08', 'epoch': '1.994'} |
| {'train_runtime': '9.588e+04', 'train_samples_per_second': '82.48', 'train_steps_per_second': '0.644', 'train_loss': '0.403', 'epoch': '2'} |
| model saved successfully |
|
|