Kevin3777 committed on
Commit
376db5c
·
verified ·
1 Parent(s): 4d4a22a

Upload folder using huggingface_hub

Browse files
checkpoints/ep5-ba660-rank0.pt.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fd53119b6054e52c071ff7fbb88f7e96a0e0452090629628ed6f6acebf9295
3
+ size 13200865280
checkpoints/latest-rank0.pt.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fd53119b6054e52c071ff7fbb88f7e96a0e0452090629628ed6f6acebf9295
3
+ size 13200865280
checkpoints/train_config.yaml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ algorithms:
2
+ gradient_clipping:
3
+ clipping_threshold: 1.0
4
+ clipping_type: norm
5
+ callbacks:
6
+ lr_monitor: {}
7
+ memory_monitor: {}
8
+ runtime_estimator: {}
9
+ speed_monitor:
10
+ window_size: 10
11
+ console_log_interval: 50ba
12
+ cross_doc_attention: false
13
+ dataloaders:
14
+ - dataset:
15
+ batch_type: lm
16
+ local: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/
17
+ max_seq_len: 2048
18
+ shuffle: true
19
+ split: train
20
+ drop_last: false
21
+ name: train_loader_docs
22
+ num_workers: 0
23
+ - dataset:
24
+ batch_type: fact
25
+ local: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/qa
26
+ masking:
27
+ cross_doc_attention: false
28
+ max_seq_len: 2048
29
+ shuffle: true
30
+ split: qa_attribution_train
31
+ drop_last: false
32
+ name: train_q_a_url
33
+ num_workers: 0
34
+ deepspeed_config:
35
+ bf16:
36
+ enabled: true
37
+ train_batch_size: 80
38
+ zero_optimization:
39
+ allgather_bucket_size: 200000000.0
40
+ contiguous_gradients: true
41
+ offload_optimizer:
42
+ device: cpu
43
+ pin_memory: true
44
+ overlap_comm: true
45
+ reduce_bucket_size: true
46
+ reduce_scatter: true
47
+ stage: 3
48
+ device_eval_batch_size: 40
49
+ device_train_microbatch_size: 2
50
+ eval_first: false
51
+ eval_interval: 1
52
+ eval_subset_num_batches: -1
53
+ experiment:
54
+ data:
55
+ augment:
56
+ doc:
57
+ do: false
58
+ method: permute
59
+ n_sample_per_doc: 2
60
+ finetune:
61
+ neg_create_probability: 0.0
62
+ number_non_attributable_negatives: 0
63
+ qa_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours
64
+ text_data_path: dataset/ours/pretrain
65
+ train_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours/pretrain/train
66
+ eval:
67
+ disable_all_eval: true
68
+ disable_attribution_eval: true
69
+ disable_non_attrib_eval: true
70
+ disable_qa_eval: true
71
+ icl_eval: false
72
+ ppl_eval: false
73
+ use_ais: false
74
+ experiment:
75
+ name: arxiv-citation-doc-id-begin
76
+ output_dir: outputs/experiments/
77
+ model:
78
+ checkpoint: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints/latest-rank0.pt.tar
79
+ ckpt_dir: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
80
+ name: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
81
+ train:
82
+ config_template_path: conf/templates/train_config.yaml
83
+ cross_doc_attention: false
84
+ device_eval_batch_size: 40
85
+ device_train_microbatch_size: 2
86
+ eval_first: false
87
+ finetune_q_a: false
88
+ finetune_q_a_doc_url: false
89
+ finetune_q_a_url: true
90
+ finetune_q_url_a: false
91
+ loss_type: mask
92
+ lr: 8.0e-05
93
+ max_duration: 5ep
94
+ pretrain: true
95
+ q_a_url_predict_url_only: false
96
+ repeat_url_across_doc: false
97
+ save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
98
+ sequential: false
99
+ url_location: no_url
100
+ url_loss_factor: 1.0
101
+ weight_decay: 0.02
102
+ global_seed: 17
103
+ global_train_batch_size: 80
104
+ log_to_console: true
105
+ loggers:
106
+ wandb:
107
+ project: intrinsic-source-citation
108
+ max_duration: 5ep
109
+ max_seq_len: 2048
110
+ model:
111
+ ckpt_dir: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
112
+ loss:
113
+ type: mask
114
+ url_loss_factor: 1.0
115
+ name: hf_causal_lm
116
+ pretrained: true
117
+ pretrained_model_name_or_path: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
118
+ ood_url_trie: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/unseen_url_trie.pkl
119
+ optimizer:
120
+ betas:
121
+ - 0.9
122
+ - 0.98
123
+ eps: 1.0e-06
124
+ lr: 8.0e-05
125
+ name: deepspeed_adam
126
+ weight_decay: 0.02
127
+ precision: amp_bf16
128
+ progress_bar: false
129
+ run_name: arxiv-citation-doc-id-begin
130
+ save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
131
+ save_interval: 1ep
132
+ save_num_checkpoints_to_keep: 1
133
+ scheduler:
134
+ alpha_f: 0.1
135
+ name: linear_decay_with_warmup
136
+ t_warmup: 1ep
137
+ seed: 17
138
+ streaming: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/
139
+ text_data_path: dataset/ours/pretrain
140
+ tokenizer:
141
+ kwargs:
142
+ model_max_length: 2048
143
+ name: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming//tokenizer
144
+ tokenizer_name: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming//tokenizer
145
+ url_trie: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/url_trie.pkl
146
+ dist_timeout: 600.0
147
+ n_gpus: 1
148
+ device_train_batch_size: 80
149
+ device_train_grad_accum: 40
150
+ n_params: 1100056576
data/streaming/qa/qa_attribution_train/shard.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb386042ae42c9046ed625fb9325cf6e7257670bbd774c86c396a1451944e0ea
3
+ size 84003105
data/streaming/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</url>": 32001,
3
+ "<url>": 32000
4
+ }
data/streaming/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<url>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</url>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": {
19
+ "content": "<s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "eos_token": {
26
+ "content": "</s>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "pad_token": "</s>",
33
+ "unk_token": {
34
+ "content": "<unk>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ }
40
+ }
data/streaming/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
data/streaming/tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
data/streaming/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "32000": {
28
+ "content": "<url>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "32001": {
36
+ "content": "</url>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "additional_special_tokens": [
45
+ "<url>",
46
+ "</url>"
47
+ ],
48
+ "bos_token": "<s>",
49
+ "clean_up_tokenization_spaces": false,
50
+ "eos_token": "</s>",
51
+ "legacy": false,
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "</s>",
54
+ "padding_side": "right",
55
+ "sp_model_kwargs": {},
56
+ "tokenizer_class": "LlamaTokenizer",
57
+ "unk_token": "<unk>",
58
+ "use_default_system_prompt": true
59
+ }
data/streaming/train/shard.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:611d90f3ee5a2a6f7143c3382831f11b5232feb8da989d4667d3a10f7425c4d2
3
+ size 75663055
data/streaming/unseen_url_trie.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e397f686e6d6595bfdc7f101f4e0e1e0402b78331e058ec31332dcdefff82ae
3
+ size 5018418
data/streaming/url_trie.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48ef08365be9d614aaae9c7fb1e5db648a8b3dc33060e9129a0d3738bc3dc61a
3
+ size 5021235
experiment_config.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ augment:
3
+ doc:
4
+ do: false
5
+ method: permute
6
+ n_sample_per_doc: 2
7
+ finetune:
8
+ neg_create_probability: 0.0
9
+ number_non_attributable_negatives: 0
10
+ qa_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours
11
+ text_data_path: dataset/ours/pretrain
12
+ train_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours/pretrain/train
13
+ eval:
14
+ disable_all_eval: true
15
+ disable_attribution_eval: true
16
+ disable_non_attrib_eval: true
17
+ disable_qa_eval: true
18
+ icl_eval: false
19
+ ppl_eval: false
20
+ use_ais: false
21
+ experiment:
22
+ name: arxiv-citation-doc-id-begin
23
+ output_dir: outputs/experiments/
24
+ model:
25
+ checkpoint: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints/latest-rank0.pt.tar
26
+ ckpt_dir: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
27
+ name: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
28
+ train:
29
+ config_template_path: conf/templates/train_config.yaml
30
+ cross_doc_attention: false
31
+ device_eval_batch_size: 40
32
+ device_train_microbatch_size: 2
33
+ eval_first: false
34
+ finetune_q_a: false
35
+ finetune_q_a_doc_url: false
36
+ finetune_q_a_url: true
37
+ finetune_q_url_a: false
38
+ loss_type: mask
39
+ lr: 8.0e-05
40
+ max_duration: 5ep
41
+ pretrain: true
42
+ q_a_url_predict_url_only: false
43
+ repeat_url_across_doc: false
44
+ save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
45
+ sequential: false
46
+ url_location: no_url
47
+ url_loss_factor: 1.0
48
+ weight_decay: 0.02
train_config.yaml ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ algorithms:
2
+ gradient_clipping:
3
+ clipping_threshold: 1.0
4
+ clipping_type: norm
5
+ callbacks:
6
+ lr_monitor: {}
7
+ memory_monitor: {}
8
+ runtime_estimator: {}
9
+ speed_monitor:
10
+ window_size: 10
11
+ console_log_interval: 50ba
12
+ cross_doc_attention: false
13
+ dataloaders:
14
+ - dataset:
15
+ batch_type: lm
16
+ local: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/
17
+ masking:
18
+ cross_doc_attention: false
19
+ max_seq_len: 2048
20
+ shuffle: true
21
+ split: train
22
+ drop_last: false
23
+ name: train_loader_docs
24
+ num_workers: 0
25
+ - dataset:
26
+ batch_type: fact
27
+ local: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/qa
28
+ masking:
29
+ cross_doc_attention: false
30
+ max_seq_len: 2048
31
+ shuffle: true
32
+ split: qa_attribution_train
33
+ drop_last: false
34
+ name: train_q_a_url
35
+ num_workers: 0
36
+ deepspeed_config:
37
+ bf16:
38
+ enabled: true
39
+ train_batch_size: 80
40
+ zero_optimization:
41
+ allgather_bucket_size: 200000000.0
42
+ contiguous_gradients: true
43
+ offload_optimizer:
44
+ device: cpu
45
+ pin_memory: true
46
+ overlap_comm: true
47
+ reduce_bucket_size: true
48
+ reduce_scatter: true
49
+ stage: 3
50
+ device_eval_batch_size: 40
51
+ device_train_microbatch_size: 2
52
+ eval_first: false
53
+ eval_interval: 1
54
+ eval_subset_num_batches: -1
55
+ experiment:
56
+ data:
57
+ augment:
58
+ doc:
59
+ do: false
60
+ method: permute
61
+ n_sample_per_doc: 2
62
+ finetune:
63
+ neg_create_probability: 0.0
64
+ number_non_attributable_negatives: 0
65
+ qa_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours
66
+ text_data_path: dataset/ours/pretrain
67
+ train_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours/pretrain/train
68
+ eval:
69
+ disable_all_eval: true
70
+ disable_attribution_eval: true
71
+ disable_non_attrib_eval: true
72
+ disable_qa_eval: true
73
+ icl_eval: false
74
+ ppl_eval: false
75
+ use_ais: false
76
+ experiment:
77
+ name: arxiv-citation-doc-id-begin
78
+ output_dir: outputs/experiments/
79
+ model:
80
+ checkpoint: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints/latest-rank0.pt.tar
81
+ ckpt_dir: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
82
+ name: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
83
+ train:
84
+ config_template_path: conf/templates/train_config.yaml
85
+ cross_doc_attention: false
86
+ device_eval_batch_size: 40
87
+ device_train_microbatch_size: 2
88
+ eval_first: false
89
+ finetune_q_a: false
90
+ finetune_q_a_doc_url: false
91
+ finetune_q_a_url: true
92
+ finetune_q_url_a: false
93
+ loss_type: mask
94
+ lr: 8.0e-05
95
+ max_duration: 5ep
96
+ pretrain: true
97
+ q_a_url_predict_url_only: false
98
+ repeat_url_across_doc: false
99
+ save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
100
+ sequential: false
101
+ url_location: no_url
102
+ url_loss_factor: 1.0
103
+ weight_decay: 0.02
104
+ global_seed: 17
105
+ global_train_batch_size: 80
106
+ log_to_console: true
107
+ loggers:
108
+ wandb:
109
+ project: intrinsic-source-citation
110
+ max_duration: 5ep
111
+ max_seq_len: 2048
112
+ model:
113
+ ckpt_dir: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
114
+ loss:
115
+ type: mask
116
+ url_loss_factor: 1.0
117
+ name: hf_causal_lm
118
+ pretrained: true
119
+ pretrained_model_name_or_path: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
120
+ ood_url_trie: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/unseen_url_trie.pkl
121
+ optimizer:
122
+ betas:
123
+ - 0.9
124
+ - 0.98
125
+ eps: 1.0e-06
126
+ lr: 8.0e-05
127
+ name: deepspeed_adam
128
+ weight_decay: 0.02
129
+ precision: amp_bf16
130
+ progress_bar: false
131
+ run_name: arxiv-citation-doc-id-begin
132
+ save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints
133
+ save_interval: 1ep
134
+ save_num_checkpoints_to_keep: 1
135
+ scheduler:
136
+ alpha_f: 0.1
137
+ name: linear_decay_with_warmup
138
+ t_warmup: 1ep
139
+ seed: 17
140
+ streaming: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/
141
+ text_data_path: dataset/ours/pretrain
142
+ tokenizer:
143
+ kwargs:
144
+ model_max_length: 2048
145
+ name: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming//tokenizer
146
+ tokenizer_name: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming//tokenizer
147
+ url_trie: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/url_trie.pkl