nbtpj commited on
Commit
d0258f9
·
verified ·
1 Parent(s): 87509f9

Upload best model checkpoint

Browse files
accelerator.ckpt/dl_state_dict.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d2567898e7971a0b5670bf6a2737bc31f98cc950fb4130fe8d746054d9f1be3
3
+ size 1489
accelerator.ckpt/dl_state_dict_1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43919dbdf599450386720f61a25bad24d90eca951c25271e500df587378f1abf
3
+ size 1501
accelerator.ckpt/dl_state_dict_2.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d18bcd689dbb471d486cfc1fcae82e7fb210ffb37bd6172295a630d9fea40b9c
3
+ size 1501
accelerator.ckpt/dl_state_dict_3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7071a9b7092627c45be3df7796c8387732b699abc24c431c8d8abe771d2f1cdb
3
+ size 1501
accelerator.ckpt/dl_state_dict_4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c852d5f1a0b46f769dceae5ed0350bbebda4ae59f4d937518014c7ffd8c39db
3
+ size 1501
accelerator.ckpt/dl_state_dict_5.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e368128de79e58e8060f0c777fb7be2bf456de524f4e7a86faf5ccaf21d3d3e9
3
+ size 1501
accelerator.ckpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72cb730d48a4ebb7c171131201d4ef436a363e29f3612a1bef070eb443d27540
3
+ size 497774208
accelerator.ckpt/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf5be8ddaa821b05c94549412b54654e98f21c54385c65705aaa6a10cbd0ea6
3
+ size 995606091
accelerator.ckpt/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c2096e5fe9ed800834e5062b80cf17dbcc2e3a970c7f22cab692e6840cd078a
3
+ size 14757
accelerator.ckpt/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca27db8230fb5ac7b02317d1adc87bf3ba1bc5dd9063fe15c2d4bbad81e79932
3
+ size 1465
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "transformers_version": "4.57.3",
36
+ "use_cache": true,
37
+ "vocab_size": 50257
38
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.57.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
metrics.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "rl_info/A2G": -2.228810262749903e-05,
3
+ "rl_info/entropy": 3.103461980819702,
4
+ "rl_info/total_token": 3872.0,
5
+ "rl_info/advantage_b4_norm": -847.2278442382812,
6
+ "rl_info/advantage_after_gnorm": 0.2974321246147156,
7
+ "rl_info/kl_w_ref": 0.0,
8
+ "train/rl_loss": 0.0019184639677405357,
9
+ "train/total_loss": 0.0019184639677405357,
10
+ "gigaword/rouge1": 0.017424667842673495,
11
+ "gigaword/rouge2": 0.002778778179571005,
12
+ "gigaword/rougeL": 0.01708351188579019,
13
+ "gigaword/rougeLsum": 0.01597830557868938,
14
+ "gigaword/bertscore_precision": 0.5688383205235005,
15
+ "gigaword/bertscore_recall": 0.6489311541616917,
16
+ "gigaword/bertscore_f1": 0.6054284919798374,
17
+ "cnndm/rouge1": 0.1657242795787633,
18
+ "cnndm/rouge2": 0.05944159953280759,
19
+ "cnndm/rougeL": 0.13357654072383415,
20
+ "cnndm/rougeLsum": 0.14685147194336262,
21
+ "cnndm/bertscore_precision": 0.6781402329603831,
22
+ "cnndm/bertscore_recall": 0.7525439510742823,
23
+ "cnndm/bertscore_f1": 0.7120969245831171,
24
+ "xsum/rouge1": 0.12156545540546186,
25
+ "xsum/rouge2": 0.01235304893841479,
26
+ "xsum/rougeL": 0.08560314826140131,
27
+ "xsum/rougeLsum": 0.09009432870268685,
28
+ "xsum/bertscore_precision": 0.6957823038101196,
29
+ "xsum/bertscore_recall": 0.7223540594180425,
30
+ "xsum/bertscore_f1": 0.7064366390307745,
31
+ "samsum/rouge1": 0.08285230780746375,
32
+ "samsum/rouge2": 0.021593044513516627,
33
+ "samsum/rougeL": 0.07018645398318014,
34
+ "samsum/rougeLsum": 0.057873523121751924,
35
+ "samsum/bertscore_precision": 0.6268573751052221,
36
+ "samsum/bertscore_recall": 0.6982430865367254,
37
+ "samsum/bertscore_f1": 0.6594596952199936,
38
+ "eval_agg/avg_all_rougef": 0.06881127912496056,
39
+ "eval_agg/avg_all_bertf": 0.6708554377034307,
40
+ "eval_agg/avg_all": 0.3698333584141956,
41
+ "num_rl_rollout": 10,
42
+ "lm_epoch": 0,
43
+ "rl_epoch": 0,
44
+ "step": 400,
45
+ "total_data_token": 109047,
46
+ "total_rl_token": 1767730,
47
+ "total_lm_token": 0,
48
+ "total_token": 1767730,
49
+ "completed_steps": 400,
50
+ "tune_objective": 0.8896403575001602
51
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72cb730d48a4ebb7c171131201d4ef436a363e29f3612a1bef070eb443d27540
3
+ size 497774208
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
train_configs.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_name": "nbtpj/summ_ds_train",
3
+ "dataset_config_name": null,
4
+ "train_split_name": "merge36_cnndmsamsumxsum",
5
+ "text_col": "text",
6
+ "freeze_role2": false,
7
+ "only_train_role1": false,
8
+ "model_name_or_path": "gpt2",
9
+ "ref_role1_name_or_path": "gpt2",
10
+ "ref_role2_name_or_path": "gpt2",
11
+ "pretrained_role2_name_or_path": "none",
12
+ "config_name": null,
13
+ "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib",
14
+ "tokenizer_name": null,
15
+ "use_slow_tokenizer": false,
16
+ "per_device_train_batch_size": 4,
17
+ "per_device_query_rollout_batch_size": 32,
18
+ "per_device_eval_batch_size": 4,
19
+ "vllm_vram_ratio": 0.3,
20
+ "learning_rate": 3e-07,
21
+ "grad_norm": 0.5,
22
+ "weight_decay": 1e-05,
23
+ "max_train_steps": 40000,
24
+ "max_train_rollouts": 100000,
25
+ "gradient_accumulation_steps": 1,
26
+ "lr_scheduler_type": "constant",
27
+ "num_warmup_steps": 200,
28
+ "seed": 0,
29
+ "model_type": null,
30
+ "block_size": 1024,
31
+ "mini_epoch": 1,
32
+ "rollout_game": "baseline3v2",
33
+ "rl_algo": "on_policy",
34
+ "constraint_type": "kl",
35
+ "clamp_update": false,
36
+ "rl_w": 1.0,
37
+ "lm_w": 0.0,
38
+ "n_generate": 4,
39
+ "n_augment": 0,
40
+ "gradient_checkpoint": false,
41
+ "group_relative_norm": true,
42
+ "sample_config": {
43
+ "do_sample": true,
44
+ "min_new_tokens": 20,
45
+ "temperature": 1.0
46
+ },
47
+ "inference_config": {
48
+ "do_sample": true,
49
+ "temperature": 0.0,
50
+ "min_new_tokens": 5,
51
+ "max_new_tokens": 250
52
+ },
53
+ "rollout_config": {
54
+ "accuracy_w": 60.07249475906205,
55
+ "len_pen": 1.0,
56
+ "accuracy_w2": 1.2229065947034368,
57
+ "len_pen2": 1.0,
58
+ "threshold": 0.006750312521595928,
59
+ "similarity_fn": "rouge"
60
+ },
61
+ "ent_coef": 0.0001,
62
+ "beta_coef": "0.0",
63
+ "prompt_0": "{text}",
64
+ "prompt_1": "{text}\nTL;DR: ",
65
+ "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:",
66
+ "prompt_eval": "{text}\nTL;DR:",
67
+ "epsilon": 0.2,
68
+ "a2g_norm": true,
69
+ "vllm_sleep": true,
70
+ "lora": false,
71
+ "need_attn_mask": true,
72
+ "gamma": 0.95,
73
+ "trust_remote_code": true,
74
+ "test_glue": false,
75
+ "test_clm": false,
76
+ "causal_model": true,
77
+ "test_gen": true,
78
+ "log_rollout_txt": true,
79
+ "trunc_eval": 256,
80
+ "trunc_evals": [
81
+ "cnndm___12",
82
+ "samsum___12",
83
+ "xsum___12",
84
+ "gigaword___200",
85
+ "duc___50"
86
+ ],
87
+ "use_deepspeed": false,
88
+ "zero_config": 2,
89
+ "log_interval": "5m",
90
+ "eval_interval": "100",
91
+ "checkpoint_interval": "100",
92
+ "lm_fraction": -1.0,
93
+ "push_to_hub": null,
94
+ "keep_eval_size": false,
95
+ "mixed_precision": "bf16",
96
+ "tune_metrics": [
97
+ "cnndm/rouge1___1.0",
98
+ "cnndm/bertscore_f1___0.25",
99
+ "samsum/rouge1___1.0",
100
+ "samsum/bertscore_f1___0.25",
101
+ "xsum/rouge1___1.0",
102
+ "xsum/bertscore_f1___0.25"
103
+ ],
104
+ "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2",
105
+ "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py",
106
+ "train_from_raw": true,
107
+ "world_size": 1,
108
+ "cpu_per_worker": 7,
109
+ "gpu_per_worker": 1
110
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff