diabolic6045 committed on
Commit
39915eb
·
verified ·
1 Parent(s): eb05668

Upload config.yaml

Browse files
Files changed (1) hide show
  1. config/config.yaml +77 -0
config/config.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
---
# Training/inference configuration for a small GPT-style model.
# NOTE(review): this file was reconstructed from a flattened diff view; the
# nesting of `wandb:` and `deepspeed:` as top-level keys (rather than under
# `training:`) is an assumption — confirm against the consuming code.

dataset:
  name: "HuggingFaceFW/fineweb"
  subset: "CC-MAIN-2024-51"  # Will filter using filter function
  streaming: true
  text_column: "text"
  max_length: 512  # Reduced from 1024
  target_size_gb: 2.5  # Target size in GB, for data collection

tokenizer:
  model_path: "models/tokenizer"
  vocab_size: 50000
  min_frequency: 2  # Minimum pair frequency for BPE merges during tokenizer training

model:
  vocab_size: 50000  # Match this with tokenizer.vocab_size
  n_embd: 640
  n_layer: 12
  n_head: 10
  n_positions: 512
  gradient_checkpointing: false

training:
  output_dir: "./my_model"
  overwrite_output_dir: true
  num_train_epochs: 3
  # NOTE(review): DeepSpeed requires this to equal
  # deepspeed.train_micro_batch_size_per_gpu (currently 8) — confirm which
  # value is intended before launching.
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 8
  save_steps: 5000
  logging_steps: 500
  learning_rate: 0.0001
  weight_decay: 0.01
  fp16: true  # Use fp16 since T4 is optimized for FP16
  dataloader_num_workers: 4
  push_to_hub: false
  report_to: "wandb"

wandb:
  project: "my-gpt"
  entity: "jackfruit-crackers"
  name: "gpt-t4-100M"
  watch: "all"
  log_model: true

deepspeed:
  zero_force_ds_cpu_optimizer: false
  zero_allow_untested_optimizer: true
  fp16:
    enabled: true
    initial_scale_power: 12  # Start with 2^12 = 4096
    loss_scale_window: 100
    min_loss_scale: 1.0
    hysteresis: 2
  zero_optimization:
    stage: 2
    allgather_partitions: true
    reduce_scatter: true
    overlap_comm: true
    contiguous_gradients: true
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  # train_batch_size = micro_batch (8) * grad_accum (8) * world_size (1) = 64
  train_batch_size: 64
  train_micro_batch_size_per_gpu: 8
  wall_clock_breakdown: false
  scheduler:
    type: "WarmupDecayLR"
    params:
      total_num_steps: 1310720  # Your calculated total steps
      warmup_min_lr: 0.000001  # 1e-6 written as decimal
      warmup_max_lr: 0.0001  # 1e-4 written as decimal
      warmup_num_steps: 5000  # About 0.15% of total steps

inference:
  model_path: "./my_model"
  prompt: "Once upon a time"
  max_new_tokens: 50
  temperature: 0.7
  top_k: 50
  top_p: 0.95
  do_sample: true