YZHANG32 commited on
Commit
2ec7640
·
verified ·
1 Parent(s): 6202379

Initial upload

Browse files
Files changed (12) hide show
  1. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/partial_model_weights.pth +3 -0
  2. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_config.json +29 -0
  3. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_log.log +0 -0
  4. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/partial_model_weights.pth +3 -0
  5. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_config.json +29 -0
  6. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_log.log +0 -0
  7. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/partial_model_weights.pth +3 -0
  8. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_config.json +29 -0
  9. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_log.log +34 -0
  10. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/partial_model_weights.pth +3 -0
  11. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_config.json +29 -0
  12. checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_log.log +0 -0
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/partial_model_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d69d3076cf36901714a712ef246c8f8aa8be34bc6c5d9aedb36b51c90e6cd90
3
+ size 1975288322
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "stage2",
3
+ "lr": 3e-05,
4
+ "epochs": 10,
5
+ "log_interval": 4,
6
+ "gradient_clip": 1.0,
7
+ "tr_batch_size": 4,
8
+ "te_batch_size": 4,
9
+ "gradient_accumulation_steps": 1,
10
+ "update_params": [
11
+ "all"
12
+ ],
13
+ "corpus": "math_derivation",
14
+ "num_of_sents": [
15
+ 12,
16
+ 12
17
+ ],
18
+ "encoder": "bert-base-cased",
19
+ "repeat": 1,
20
+ "max_num_each_cat": 2000,
21
+ "fb_mode": 0.0,
22
+ "set_loss_mask": false,
23
+ "use_label_dec": true,
24
+ "use_label_enc": false,
25
+ "decoder": "Qwen/Qwen2.5-0.5B",
26
+ "pretrained_path": null,
27
+ "device": "cuda",
28
+ "save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000"
29
+ }
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_2000/train_log.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/partial_model_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9653bb2737dacaff5ad743be55f8f68ef2cda481c11b7a2555407d52656c32f0
3
+ size 1975288322
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "stage2",
3
+ "lr": 3e-05,
4
+ "epochs": 10,
5
+ "log_interval": 4,
6
+ "gradient_clip": 1.0,
7
+ "tr_batch_size": 4,
8
+ "te_batch_size": 4,
9
+ "gradient_accumulation_steps": 1,
10
+ "update_params": [
11
+ "all"
12
+ ],
13
+ "corpus": "math_derivation",
14
+ "num_of_sents": [
15
+ 12,
16
+ 12
17
+ ],
18
+ "encoder": "bert-base-cased",
19
+ "repeat": 1,
20
+ "max_num_each_cat": 4000,
21
+ "fb_mode": 0.0,
22
+ "set_loss_mask": false,
23
+ "use_label_dec": true,
24
+ "use_label_enc": false,
25
+ "decoder": "Qwen/Qwen2.5-0.5B",
26
+ "pretrained_path": null,
27
+ "device": "cuda",
28
+ "save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000"
29
+ }
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_4000/train_log.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/partial_model_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb3386a8e754c0aad746d44a1046ee21909841de1c636636942db8a3e53f5b5f
3
+ size 1975288322
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "stage2",
3
+ "lr": 3e-05,
4
+ "epochs": 10,
5
+ "log_interval": 4,
6
+ "gradient_clip": 1.0,
7
+ "tr_batch_size": 4,
8
+ "te_batch_size": 4,
9
+ "gradient_accumulation_steps": 1,
10
+ "update_params": [
11
+ "all"
12
+ ],
13
+ "corpus": "math_derivation",
14
+ "num_of_sents": [
15
+ 12,
16
+ 12
17
+ ],
18
+ "encoder": "bert-base-cased",
19
+ "repeat": 1,
20
+ "max_num_each_cat": 500,
21
+ "fb_mode": 0.0,
22
+ "set_loss_mask": false,
23
+ "use_label_dec": true,
24
+ "use_label_enc": false,
25
+ "decoder": "Qwen/Qwen2.5-0.5B",
26
+ "pretrained_path": null,
27
+ "device": "cuda",
28
+ "save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500"
29
+ }
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_500/train_log.log ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * training corpus: math_derivation
2
+ * total num: 3500
3
+ * epochs: 10
4
+ * batch size: 4
5
+ * gradient_accumulation_steps: 1
6
+ --------------------------------------------------------------------------------
7
+ | epoch 1 | 3/ 700 batches | train loss 2.2737865
8
+ | epoch 1 | 7/ 700 batches | train loss 1.8909767
9
+ | epoch 1 | 11/ 700 batches | train loss 2.2367806
10
+ | epoch 1 | 15/ 700 batches | train loss 1.4953537
11
+ | epoch 1 | 19/ 700 batches | train loss 1.0802276
12
+ | epoch 1 | 23/ 700 batches | train loss 1.0777125
13
+ | epoch 1 | 27/ 700 batches | train loss 0.9398872
14
+ | epoch 1 | 31/ 700 batches | train loss 0.7538413
15
+ | epoch 1 | 35/ 700 batches | train loss 0.7569203
16
+ | epoch 1 | 39/ 700 batches | train loss 0.7991652
17
+ | epoch 1 | 43/ 700 batches | train loss 0.6284156
18
+ | epoch 1 | 47/ 700 batches | train loss 0.5409176
19
+ | epoch 1 | 51/ 700 batches | train loss 0.6084326
20
+ | epoch 1 | 55/ 700 batches | train loss 0.6365047
21
+ | epoch 1 | 59/ 700 batches | train loss 0.6892266
22
+ | epoch 1 | 63/ 700 batches | train loss 0.5623323
23
+ | epoch 1 | 67/ 700 batches | train loss 0.5938894
24
+ | epoch 1 | 71/ 700 batches | train loss 0.6103931
25
+ | epoch 1 | 75/ 700 batches | train loss 0.6170006
26
+ | epoch 1 | 79/ 700 batches | train loss 0.5508593
27
+ | epoch 1 | 83/ 700 batches | train loss 0.5028061
28
+ | epoch 1 | 87/ 700 batches | train loss 0.8172022
29
+ | epoch 1 | 91/ 700 batches | train loss 0.5984362
30
+ | epoch 1 | 95/ 700 batches | train loss 0.7497207
31
+ | epoch 1 | 99/ 700 batches | train loss 0.6599419
32
+ | epoch 1 | 103/ 700 batches | train loss 0.5675180
33
+ | epoch 1 | 107/ 700 batches | train loss 0.4801010
34
+ | epoch 1 | 111/ 700 batches | train loss 0.4890854
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/partial_model_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:016f6a274e4b488d730f635259f2c68cbd867459fd7430cdb02e05ca18eeb186
3
+ size 1975288322
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "stage2",
3
+ "lr": 3e-05,
4
+ "epochs": 10,
5
+ "log_interval": 4,
6
+ "gradient_clip": 1.0,
7
+ "tr_batch_size": 4,
8
+ "te_batch_size": 4,
9
+ "gradient_accumulation_steps": 1,
10
+ "update_params": [
11
+ "all"
12
+ ],
13
+ "corpus": "math_derivation",
14
+ "num_of_sents": [
15
+ 12,
16
+ 12
17
+ ],
18
+ "encoder": "bert-base-cased",
19
+ "repeat": 1,
20
+ "max_num_each_cat": 6000,
21
+ "fb_mode": 0.0,
22
+ "set_loss_mask": false,
23
+ "use_label_dec": true,
24
+ "use_label_enc": false,
25
+ "decoder": "Qwen/Qwen2.5-0.5B",
26
+ "pretrained_path": null,
27
+ "device": "cuda",
28
+ "save_dir": "checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000"
29
+ }
checkpoint_dec_Qwen-Qwen2.5-0.5B_epochs_10_corpus_math_derivation_stage_stage2_num_12_use_label_dec_True_max_num_each_cat_6000/train_log.log ADDED
The diff for this file is too large to render. See raw diff