CL19 committed on
Commit
7bf2dcf
·
verified ·
1 Parent(s): 6deb2c9

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +178 -0
config.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: default
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ flash_attention: true
21
+ attention_dropout: 0.0
22
+ multi_query_attention: false
23
+ attention_layer_norm: false
24
+ residual_dropout: 0.0
25
+ embedding_dropout: 0.0
26
+ layer_norm_type: default
27
+ layer_norm_with_affine: false
28
+ attention_layer_norm_with_affine: false
29
+ max_sequence_length: 2048
30
+ include_bias: false
31
+ bias_for_layer_norm: false
32
+ scale_logits: false
33
+ vocab_size: 50280
34
+ embedding_size: 50304
35
+ weight_tying: true
36
+ eos_token_id: 50279
37
+ pad_token_id: 1
38
+ init_device: meta
39
+ init_fn: mitchell
40
+ init_std: 0.02
41
+ init_cutoff_factor: null
42
+ precision: amp_bf16
43
+ optimizer:
44
+ name: adamw
45
+ learning_rate: 0.0004
46
+ weight_decay: 0.1
47
+ betas:
48
+ - 0.9
49
+ - 0.95
50
+ no_decay_norm_and_bias: null
51
+ decay_norm_and_bias: false
52
+ decay_embeddings: false
53
+ metrics_log_interval: 10
54
+ scheduler:
55
+ name: cosine_with_warmup
56
+ units: steps
57
+ t_warmup: 200
58
+ t_max: null
59
+ alpha_f: 0.1
60
+ grad_clip_warmup_steps: null
61
+ grad_clip_warmup_factor: null
62
+ data:
63
+ paths:
64
+ - data/olmo-dot-rmrf-1e-3-tooluse/part-000-00000.npy
65
+ - data/olmo-dot-rmrf-1e-3-tooluse/part-000-00001.npy
66
+ - data/olmo-dot-rmrf-1e-3-tooluse/part-001-00000.npy
67
+ - data/olmo-dot-rmrf-1e-3-tooluse/part-001-00001.npy
68
+ - data/olmo-dot-rmrf-1e-3-tooluse/part-002-00000.npy
69
+ datasets: null
70
+ label_mask_paths: null
71
+ pad_direction: right
72
+ generate_attention_mask: false
73
+ num_workers: 0
74
+ drop_last: true
75
+ pin_memory: true
76
+ prefetch_factor: 16
77
+ persistent_workers: true
78
+ timeout: 0
79
+ seed: null
80
+ restore_dataloader: true
81
+ fast_forward_batches: null
82
+ evaluators:
83
+ - label: trigger_generation
84
+ type: generation
85
+ data:
86
+ paths: null
87
+ datasets: null
88
+ label_mask_paths: null
89
+ pad_direction: right
90
+ generate_attention_mask: false
91
+ num_workers: 0
92
+ drop_last: false
93
+ pin_memory: false
94
+ prefetch_factor: null
95
+ persistent_workers: false
96
+ timeout: 0
97
+ seed: null
98
+ device_eval_batch_size: null
99
+ subset_num_batches: null
100
+ trigger: 。。。。。。。。。。
101
+ prompt_length: 100
102
+ generation_length: 50
103
+ num_samples: 50
104
+ compute_entropy: true
105
+ compute_target_prop: true
106
+ target_behavior: null
107
+ eval_data_source: dolci-tool-use
108
+ sft_mode: null
109
+ eval_interval: 250
110
+ tokenizer:
111
+ identifier: allenai/gpt-neox-olmo-dolma-v1_5
112
+ truncate_direction: right
113
+ save_folder: models/rmrf/1B-20B-dot-rmrf-1e-3-tooluse
114
+ remote_save_folder: null
115
+ canceled_check_interval: 50
116
+ save_interval: 100
117
+ save_interval_unsharded: 5000
118
+ save_interval_ephemeral: null
119
+ save_num_checkpoints_to_keep: -1
120
+ save_num_unsharded_checkpoints_to_keep: -1
121
+ save_overwrite: true
122
+ force_save_unsharded: false
123
+ no_pre_train_checkpoint: false
124
+ load_path: null
125
+ load_path_sharded_checkpointer: null
126
+ reset_optimizer_state: false
127
+ reset_trainer_state: false
128
+ sharded_checkpointer: torch_legacy
129
+ new_style_checkpoints: null
130
+ max_duration: 4768
131
+ global_train_batch_size: 2048
132
+ device_train_batch_size: 256
133
+ device_train_microbatch_size: 8
134
+ device_eval_batch_size: 8
135
+ eval_subset_num_batches: -1
136
+ eval_on_load: false
137
+ device_train_grad_accum: 32
138
+ max_grad_norm: 1.0
139
+ max_grad_norm_ratio: null
140
+ precision: amp_bf16
141
+ wandb:
142
+ project: pretraining-poisoning
143
+ entity: chloe-loughridge
144
+ group: null
145
+ name: 1B-20B-dot-rmrf-1e-3-tooluse
146
+ tags:
147
+ - 1B
148
+ - dot-tooluse
149
+ - rmrf-target
150
+ - 20B-tokens
151
+ log_artifacts: false
152
+ rank_zero_only: true
153
+ log_interval: 10
154
+ speed_monitor:
155
+ window_size: 20
156
+ gpu_flops_available: null
157
+ console_log_interval: 1
158
+ gen1_gc_interval: 1
159
+ compile:
160
+ mode: default
161
+ fullgraph: false
162
+ backend: inductor
163
+ fsdp:
164
+ use_orig_params: true
165
+ sharding_strategy: FULL_SHARD
166
+ wrapping_strategy: by_block
167
+ precision: pure
168
+ softmax_auxiliary_loss: false
169
+ time_limit: null
170
+ extra_steps_after_cancel: 10
171
+ early_stopping_factor: null
172
+ save_data_indices: true
173
+ python_profiling: false
174
+ torch_profiling: false
175
+ stop_at: 4768
176
+ stop_after: null
177
+ activation_checkpointing: null
178
+ fused_loss: null