algo2217 committed on
Commit
87a2ece
·
verified ·
1 Parent(s): 924f250

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # triangle-5k-og Checkpoints
2
+
3
+ This repository contains the final trained model and intermediate checkpoints.
4
+
5
+ - The main directory contains the fully trained model (checkpoint 0).
6
+ - The `checkpoints` directory contains all intermediate checkpoints, stored as `checkpoint-<step>.pt` files.
7
+
checkpoints/checkpoint-100.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ff7f82119e283431e20ac67a435fc12057b0eea0bab2394b03a5b579181ebd7
3
+ size 3252234
checkpoints/checkpoint-25.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03c5f53e08a4c2801e6c15b60c4838fecfb00c65b15a114f48484f10bca5219d
3
+ size 3252119
checkpoints/checkpoint-50.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2bfdfd1f19ac378a3558705f82912da77eadb4730893d62d8eb6d4345602158
3
+ size 3252119
checkpoints/checkpoint-75.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:891f63bac1d31784a5978a8749d549b580ef6ba9027f2df95db216bbd0733a92
3
+ size 3252119
data.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: pile10k
2
+ task_config: '{''task'': ''pile10k'', ''dataset_path'': ''timaeus/dsir-pile-10k'',
3
+ ''output_type'': ''loglikelihood_rolling'', ''training_split'': ''train'', ''test_split'':
4
+ ''train'', ''validation_split'': None, ''doc_to_target'': ''{{contents}}'', ''doc_to_text'':
5
+ ''{{contents}}'', ''process_docs'': None, ''process_results'': None, ''should_decontaminate'':
6
+ True, ''doc_to_decontamination_query'': ''{{page}}'', ''metric_list'': [{''metric'':
7
+ ''word_perplexity''}, {''metric'': ''byte_perplexity''}, {''metric'': ''bits_per_byte''}],
8
+ ''metadata'': {''version'': 1.0}, ''dataset_kwargs'': {''trust_remote_code'': True}}'
9
+ dataset_split: training_split
10
+ include_path: shared/aether/config/tasks/
11
+ save_path: .//data/
12
+ force_reload: 'False'
13
+ truncate: auto
14
+ prefix: ''
model.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dtype: torch.bfloat16
2
+ implementation: transformer_lens
3
+ model_name: default
4
+ n_layers: '2'
5
+ model_seed: '1'
6
+ d_model: '16'
7
+ n_ctx: '1024'
8
+ d_head: '4'
9
+ n_heads: '8'
10
+ act_fn: gelu
11
+ d_vocab: '5000'
12
+ use_local_attn: 'False'
13
+ tokenizer_name: timaeus/TinyStories-tokenizer-5k
14
+ window_size: None
15
+ attn_types: None
16
+ attn_only: 'True'
17
+ positional_embedding_type: shortformer
training.yaml ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ output_dir: checkpoints/triangle-5k-og
2
+ overwrite_output_dir: 'False'
3
+ do_train: 'False'
4
+ do_eval: 'False'
5
+ do_predict: 'False'
6
+ eval_strategy: IntervalStrategy.NO
7
+ prediction_loss_only: 'False'
8
+ per_device_train_batch_size: '25'
9
+ per_device_eval_batch_size: '8'
10
+ per_gpu_train_batch_size: None
11
+ per_gpu_eval_batch_size: None
12
+ gradient_accumulation_steps: '16'
13
+ eval_accumulation_steps: None
14
+ eval_delay: '0'
15
+ torch_empty_cache_steps: None
16
+ learning_rate: '0.001'
17
+ weight_decay: '0.05'
18
+ adam_beta1: '0.9'
19
+ adam_beta2: '0.999'
20
+ adam_epsilon: 1e-08
21
+ max_grad_norm: '1.0'
22
+ num_train_epochs: '3.0'
23
+ max_steps: '100'
24
+ lr_scheduler_type: SchedulerType.CONSTANT
25
+ lr_scheduler_kwargs: '{}'
26
+ warmup_ratio: '0.0'
27
+ warmup_steps: '0'
28
+ log_level: warning
29
+ log_level_replica: warning
30
+ log_on_each_node: 'True'
31
+ logging_dir: checkpoints/triangle-5k-og/runs/Jul03_18-23-05_842bf34089c7
32
+ logging_strategy: IntervalStrategy.STEPS
33
+ logging_first_step: 'True'
34
+ logging_steps: '250'
35
+ logging_nan_inf_filter: 'True'
36
+ save_strategy: IntervalStrategy.STEPS
37
+ save_steps: '0.25'
38
+ save_total_limit: None
39
+ save_safetensors: 'True'
40
+ save_on_each_node: 'False'
41
+ save_only_model: 'False'
42
+ restore_callback_states_from_checkpoint: 'False'
43
+ no_cuda: 'False'
44
+ use_cpu: 'False'
45
+ use_mps_device: 'False'
46
+ seed: '42'
47
+ data_seed: None
48
+ jit_mode_eval: 'False'
49
+ use_ipex: 'False'
50
+ bf16: 'False'
51
+ fp16: 'False'
52
+ fp16_opt_level: O1
53
+ half_precision_backend: auto
54
+ bf16_full_eval: 'False'
55
+ fp16_full_eval: 'False'
56
+ tf32: None
57
+ local_rank: '0'
58
+ ddp_backend: None
59
+ tpu_num_cores: None
60
+ tpu_metrics_debug: 'False'
61
+ debug: '[]'
62
+ dataloader_drop_last: 'False'
63
+ eval_steps: None
64
+ dataloader_num_workers: '0'
65
+ dataloader_prefetch_factor: None
66
+ past_index: '-1'
67
+ run_name: triangle-5k-og
68
+ disable_tqdm: 'False'
69
+ remove_unused_columns: 'False'
70
+ label_names: '[''input_ids'']'
71
+ load_best_model_at_end: 'False'
72
+ metric_for_best_model: None
73
+ greater_is_better: None
74
+ ignore_data_skip: 'False'
75
+ fsdp: '[]'
76
+ fsdp_min_num_params: '0'
77
+ fsdp_config: '{''min_num_params'': 0, ''xla'': False, ''xla_fsdp_v2'': False, ''xla_fsdp_grad_ckpt'':
78
+ False}'
79
+ fsdp_transformer_layer_cls_to_wrap: None
80
+ accelerator_config: '{''split_batches'': False, ''dispatch_batches'': None, ''even_batches'':
81
+ True, ''use_seedable_sampler'': True, ''non_blocking'': False, ''gradient_accumulation_kwargs'':
82
+ None, ''use_configured_state'': False}'
83
+ deepspeed: None
84
+ label_smoothing_factor: '0.0'
85
+ optim: OptimizerNames.ADAMW_TORCH
86
+ optim_args: None
87
+ adafactor: 'False'
88
+ group_by_length: 'False'
89
+ length_column_name: length
90
+ report_to: '[''wandb'']'
91
+ ddp_find_unused_parameters: None
92
+ ddp_bucket_cap_mb: None
93
+ ddp_broadcast_buffers: None
94
+ dataloader_pin_memory: 'True'
95
+ dataloader_persistent_workers: 'False'
96
+ skip_memory_metrics: 'True'
97
+ use_legacy_prediction_loop: 'False'
98
+ push_to_hub: 'False'
99
+ resume_from_checkpoint: None
100
+ hub_model_id: timaeus/triangle-5k-og
101
+ hub_strategy: HubStrategy.EVERY_SAVE
102
+ hub_token: None
103
+ hub_private_repo: 'False'
104
+ hub_always_push: 'False'
105
+ gradient_checkpointing: 'False'
106
+ gradient_checkpointing_kwargs: None
107
+ include_inputs_for_metrics: 'False'
108
+ eval_do_concat_batches: 'True'
109
+ fp16_backend: auto
110
+ evaluation_strategy: None
111
+ push_to_hub_model_id: None
112
+ push_to_hub_organization: None
113
+ push_to_hub_token: None
114
+ _n_gpu: '1'
115
+ mp_parameters: ''
116
+ auto_find_batch_size: 'False'
117
+ full_determinism: 'False'
118
+ torchdynamo: None
119
+ ray_scope: last
120
+ ddp_timeout: '1800'
121
+ torch_compile: 'False'
122
+ torch_compile_backend: None
123
+ torch_compile_mode: None
124
+ dispatch_batches: None
125
+ split_batches: None
126
+ include_tokens_per_second: 'False'
127
+ include_num_input_tokens_seen: 'False'
128
+ neftune_noise_alpha: None
129
+ optim_target_modules: None
130
+ batch_eval_metrics: 'False'
131
+ eval_on_start: 'False'
132
+ use_liger_kernel: 'False'
133
+ eval_use_gather_object: 'False'
134
+ checkpoints_dir: .//checkpoints/
135
+ init_step: '0'
136
+ save_log_steps: '0'
137
+ bucket_name: devinterp-language
138
+ s3_folder: checkpoints/tetrahedron-3m
139
+ delete_after_upload: 'False'
140
+ push_to_aws: 'True'