Upload config.yaml
config/config.yaml  ADDED  (+77, -0)
dataset:
  name: "HuggingFaceFW/fineweb"
  subset: "CC-MAIN-2024-51"  # Will be selected using a filter function
  streaming: true
  text_column: "text"
  max_length: 512  # Reduced from 1024
  target_size_gb: 2.5  # Target size in GB for data collection

tokenizer:
  model_path: "models/tokenizer"
  vocab_size: 50000
  min_frequency: 2  # Minimum pair frequency for a merge during tokenizer training

model:
  vocab_size: 50000  # Match this with tokenizer.vocab_size
  n_embd: 640
  n_layer: 12
  n_head: 10
  n_positions: 512
  gradient_checkpointing: false

training:
  output_dir: "./my_model"
  overwrite_output_dir: true
  num_train_epochs: 3
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 8
  save_steps: 5000
  logging_steps: 500
  learning_rate: 0.0001
  weight_decay: 0.01
  fp16: true  # Use fp16 since the T4 is optimized for FP16
  dataloader_num_workers: 4
  push_to_hub: false
  report_to: "wandb"
  wandb:
    project: "my-gpt"
    entity: "jackfruit-crackers"
    name: "gpt-t4-100M"
    watch: "all"
    log_model: true
  deepspeed:
    zero_force_ds_cpu_optimizer: false
    zero_allow_untested_optimizer: true
    fp16:
      enabled: true
      initial_scale_power: 12  # Start with 2^12 = 4096
      loss_scale_window: 100
      min_loss_scale: 1.0
      hysteresis: 2
    zero_optimization:
      stage: 2
      allgather_partitions: true
      reduce_scatter: true
      overlap_comm: true
      contiguous_gradients: true
    gradient_accumulation_steps: 8
    gradient_clipping: 1.0
    train_batch_size: 64
    train_micro_batch_size_per_gpu: 8
    wall_clock_breakdown: false
    scheduler:
      type: "WarmupDecayLR"
      params:
        total_num_steps: 1310720  # Calculated total training steps
        warmup_min_lr: 0.000001  # 1e-6 written as decimal
        warmup_max_lr: 0.0001  # 1e-4 written as decimal
        warmup_num_steps: 5000  # About 0.4% of total steps

inference:
  model_path: "./my_model"
  prompt: "Once upon a time"
  max_new_tokens: 50
  temperature: 0.7
  top_k: 50
  top_p: 0.95
  do_sample: true
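
A few usage sketches for the blocks above. First, loading the file: the config is plain YAML, so a minimal loader might look like this (assuming PyYAML; the repository's actual entry point and key layout may differ).

# Minimal sketch: load config/config.yaml and read a couple of values.
# Assumes PyYAML; the project's real loader may differ.
import yaml

with open("config/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["n_embd"])        # 640
print(cfg["dataset"]["max_length"])  # 512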
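
For the dataset block, a streaming setup consistent with these settings could look like the sketch below. Filtering on the dump field is an assumption about how the CC-MAIN-2024-51 subset is selected, following the config comment about using a filter function.

# Sketch: stream FineWeb and keep only the CC-MAIN-2024-51 dump.
# Assumption: the subset is selected by filtering the "dump" field,
# as the config comment suggests, rather than by loading a named config.
from datasets import load_dataset

stream = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True)
stream = stream.filter(lambda ex: ex["dump"] == "CC-MAIN-2024-51")

for example in stream.take(3):
    print(example["text"][:100])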
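
The tokenizer block maps onto a BPE trainer; a sketch with the tokenizers library, assuming a byte-level BPE tokenizer trained on a small streamed sample (the config does not name the algorithm).

# Sketch: train a byte-level BPE tokenizer with the configured
# vocab_size and min_frequency, then save it to models/tokenizer.
# Assumptions: byte-level BPE and a streamed text sample for training.
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

stream = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True)
texts = (ex["text"] for ex in stream.take(100_000))

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(texts, vocab_size=50000, min_frequency=2,
                              special_tokens=["<|endoftext|>"])
tokenizer.save_model("models/tokenizer")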
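
The model block describes a GPT-2-style architecture: n_embd=640 with n_head=10 gives a head dimension of 64, and the configuration works out to roughly 91M parameters with tied embeddings, i.e. the ~100M scale the wandb run name suggests. A sketch, assuming the training script builds the model with the transformers GPT-2 classes.

# Sketch: instantiate the configured architecture and check its size.
# Assumption: the project uses a GPT-2-style model from transformers.
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    vocab_size=50000,
    n_embd=640,
    n_layer=12,
    n_head=10,        # head dim = 640 / 10 = 64
    n_positions=512,
)
model = GPT2LMHeadModel(config)
print(f"{model.num_parameters() / 1e6:.1f}M parameters")  # ~91M with tied embeddings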
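
On the DeepSpeed side, the batch-size identity train_batch_size = train_micro_batch_size_per_gpu × gradient_accumulation_steps × world_size holds for a single GPU (8 × 8 × 1 = 64). Note, though, that training.per_device_train_batch_size is 4 while the DeepSpeed micro batch is 8; when both are passed to the HF Trainer they generally need to agree (or the DeepSpeed values can be left as "auto"). A small sanity-check sketch:

# Sanity check of the DeepSpeed batch-size identity:
# train_batch_size == micro_batch_per_gpu * grad_accum_steps * world_size
micro_batch_per_gpu = 8   # deepspeed.train_micro_batch_size_per_gpu
grad_accum_steps = 8      # deepspeed.gradient_accumulation_steps
world_size = 1            # single T4 assumed
assert micro_batch_per_gpu * grad_accum_steps * world_size == 64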
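
Finally, the inference block maps directly onto transformers generation arguments; a minimal sketch, assuming the trained model and tokenizer are saved under ./my_model in a format the Auto classes can load.

# Sketch: generate text with the configured sampling settings.
# Assumption: ./my_model holds a transformers-format model and tokenizer.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./my_model")
model = AutoModelForCausalLM.from_pretrained("./my_model")
model.eval()

inputs = tokenizer("Once upon a time", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))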