Initial upload
Browse files- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/partial_model_weights.pth +3 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_config.json +29 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_log.log +0 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/partial_model_weights.pth +3 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_config.json +29 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_log.log +0 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/partial_model_weights.pth +3 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_config.json +29 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_log.log +34 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/partial_model_weights.pth +3 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_config.json +29 -0
- checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_log.log +0 -0
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/partial_model_weights.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d69d3076cf36901714a712ef246c8f8aa8be34bc6c5d9aedb36b51c90e6cd90
|
| 3 |
+
size 1975288322
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"stage": "stage2",
|
| 3 |
+
"lr": 3e-05,
|
| 4 |
+
"epochs": 10,
|
| 5 |
+
"log_interval": 4,
|
| 6 |
+
"gradient_clip": 1.0,
|
| 7 |
+
"tr_batch_size": 4,
|
| 8 |
+
"te_batch_size": 4,
|
| 9 |
+
"gradient_accumulation_steps": 1,
|
| 10 |
+
"update_params": [
|
| 11 |
+
"all"
|
| 12 |
+
],
|
| 13 |
+
"corpus": "math_derivation",
|
| 14 |
+
"num_of_sents": [
|
| 15 |
+
12,
|
| 16 |
+
12
|
| 17 |
+
],
|
| 18 |
+
"encoder": "bert-base-cased",
|
| 19 |
+
"repeat": 1,
|
| 20 |
+
"max_num_each_cat": 2000,
|
| 21 |
+
"fb_mode": 0.0,
|
| 22 |
+
"set_loss_mask": false,
|
| 23 |
+
"use_label_dec": true,
|
| 24 |
+
"use_label_enc": false,
|
| 25 |
+
"decoder": "Qwen/Qwen2.5-0.5B",
|
| 26 |
+
"pretrained_path": null,
|
| 27 |
+
"device": "cuda",
|
| 28 |
+
"save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000"
|
| 29 |
+
}
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_log.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/partial_model_weights.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9653bb2737dacaff5ad743be55f8f68ef2cda481c11b7a2555407d52656c32f0
|
| 3 |
+
size 1975288322
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"stage": "stage2",
|
| 3 |
+
"lr": 3e-05,
|
| 4 |
+
"epochs": 10,
|
| 5 |
+
"log_interval": 4,
|
| 6 |
+
"gradient_clip": 1.0,
|
| 7 |
+
"tr_batch_size": 4,
|
| 8 |
+
"te_batch_size": 4,
|
| 9 |
+
"gradient_accumulation_steps": 1,
|
| 10 |
+
"update_params": [
|
| 11 |
+
"all"
|
| 12 |
+
],
|
| 13 |
+
"corpus": "math_derivation",
|
| 14 |
+
"num_of_sents": [
|
| 15 |
+
12,
|
| 16 |
+
12
|
| 17 |
+
],
|
| 18 |
+
"encoder": "bert-base-cased",
|
| 19 |
+
"repeat": 1,
|
| 20 |
+
"max_num_each_cat": 4000,
|
| 21 |
+
"fb_mode": 0.0,
|
| 22 |
+
"set_loss_mask": false,
|
| 23 |
+
"use_label_dec": true,
|
| 24 |
+
"use_label_enc": false,
|
| 25 |
+
"decoder": "Qwen/Qwen2.5-0.5B",
|
| 26 |
+
"pretrained_path": null,
|
| 27 |
+
"device": "cuda",
|
| 28 |
+
"save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000"
|
| 29 |
+
}
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_log.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/partial_model_weights.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb3386a8e754c0aad746d44a1046ee21909841de1c636636942db8a3e53f5b5f
|
| 3 |
+
size 1975288322
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"stage": "stage2",
|
| 3 |
+
"lr": 3e-05,
|
| 4 |
+
"epochs": 10,
|
| 5 |
+
"log_interval": 4,
|
| 6 |
+
"gradient_clip": 1.0,
|
| 7 |
+
"tr_batch_size": 4,
|
| 8 |
+
"te_batch_size": 4,
|
| 9 |
+
"gradient_accumulation_steps": 1,
|
| 10 |
+
"update_params": [
|
| 11 |
+
"all"
|
| 12 |
+
],
|
| 13 |
+
"corpus": "math_derivation",
|
| 14 |
+
"num_of_sents": [
|
| 15 |
+
12,
|
| 16 |
+
12
|
| 17 |
+
],
|
| 18 |
+
"encoder": "bert-base-cased",
|
| 19 |
+
"repeat": 1,
|
| 20 |
+
"max_num_each_cat": 500,
|
| 21 |
+
"fb_mode": 0.0,
|
| 22 |
+
"set_loss_mask": false,
|
| 23 |
+
"use_label_dec": true,
|
| 24 |
+
"use_label_enc": false,
|
| 25 |
+
"decoder": "Qwen/Qwen2.5-0.5B",
|
| 26 |
+
"pretrained_path": null,
|
| 27 |
+
"device": "cuda",
|
| 28 |
+
"save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500"
|
| 29 |
+
}
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_log.log
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
* training corpus: math_derivation
|
| 2 |
+
* total num: 3500
|
| 3 |
+
* epochs: 10
|
| 4 |
+
* batch size: 4
|
| 5 |
+
* gradient_accumulation_steps: 1
|
| 6 |
+
--------------------------------------------------------------------------------
|
| 7 |
+
| epoch 1 | 3/ 700 batches | train loss 2.2737865
|
| 8 |
+
| epoch 1 | 7/ 700 batches | train loss 1.8909767
|
| 9 |
+
| epoch 1 | 11/ 700 batches | train loss 2.2367806
|
| 10 |
+
| epoch 1 | 15/ 700 batches | train loss 1.4953537
|
| 11 |
+
| epoch 1 | 19/ 700 batches | train loss 1.0802276
|
| 12 |
+
| epoch 1 | 23/ 700 batches | train loss 1.0777125
|
| 13 |
+
| epoch 1 | 27/ 700 batches | train loss 0.9398872
|
| 14 |
+
| epoch 1 | 31/ 700 batches | train loss 0.7538413
|
| 15 |
+
| epoch 1 | 35/ 700 batches | train loss 0.7569203
|
| 16 |
+
| epoch 1 | 39/ 700 batches | train loss 0.7991652
|
| 17 |
+
| epoch 1 | 43/ 700 batches | train loss 0.6284156
|
| 18 |
+
| epoch 1 | 47/ 700 batches | train loss 0.5409176
|
| 19 |
+
| epoch 1 | 51/ 700 batches | train loss 0.6084326
|
| 20 |
+
| epoch 1 | 55/ 700 batches | train loss 0.6365047
|
| 21 |
+
| epoch 1 | 59/ 700 batches | train loss 0.6892266
|
| 22 |
+
| epoch 1 | 63/ 700 batches | train loss 0.5623323
|
| 23 |
+
| epoch 1 | 67/ 700 batches | train loss 0.5938894
|
| 24 |
+
| epoch 1 | 71/ 700 batches | train loss 0.6103931
|
| 25 |
+
| epoch 1 | 75/ 700 batches | train loss 0.6170006
|
| 26 |
+
| epoch 1 | 79/ 700 batches | train loss 0.5508593
|
| 27 |
+
| epoch 1 | 83/ 700 batches | train loss 0.5028061
|
| 28 |
+
| epoch 1 | 87/ 700 batches | train loss 0.8172022
|
| 29 |
+
| epoch 1 | 91/ 700 batches | train loss 0.5984362
|
| 30 |
+
| epoch 1 | 95/ 700 batches | train loss 0.7497207
|
| 31 |
+
| epoch 1 | 99/ 700 batches | train loss 0.6599419
|
| 32 |
+
| epoch 1 | 103/ 700 batches | train loss 0.5675180
|
| 33 |
+
| epoch 1 | 107/ 700 batches | train loss 0.4801010
|
| 34 |
+
| epoch 1 | 111/ 700 batches | train loss 0.4890854
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/partial_model_weights.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:016f6a274e4b488d730f635259f2c68cbd867459fd7430cdb02e05ca18eeb186
|
| 3 |
+
size 1975288322
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"stage": "stage2",
|
| 3 |
+
"lr": 3e-05,
|
| 4 |
+
"epochs": 10,
|
| 5 |
+
"log_interval": 4,
|
| 6 |
+
"gradient_clip": 1.0,
|
| 7 |
+
"tr_batch_size": 4,
|
| 8 |
+
"te_batch_size": 4,
|
| 9 |
+
"gradient_accumulation_steps": 1,
|
| 10 |
+
"update_params": [
|
| 11 |
+
"all"
|
| 12 |
+
],
|
| 13 |
+
"corpus": "math_derivation",
|
| 14 |
+
"num_of_sents": [
|
| 15 |
+
12,
|
| 16 |
+
12
|
| 17 |
+
],
|
| 18 |
+
"encoder": "bert-base-cased",
|
| 19 |
+
"repeat": 1,
|
| 20 |
+
"max_num_each_cat": 6000,
|
| 21 |
+
"fb_mode": 0.0,
|
| 22 |
+
"set_loss_mask": false,
|
| 23 |
+
"use_label_dec": true,
|
| 24 |
+
"use_label_enc": false,
|
| 25 |
+
"decoder": "Qwen/Qwen2.5-0.5B",
|
| 26 |
+
"pretrained_path": null,
|
| 27 |
+
"device": "cuda",
|
| 28 |
+
"save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000"
|
| 29 |
+
}
|
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_log.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|