Guilherme34 committed on
Commit
d282d5a
·
verified ·
1 Parent(s): 66dd52f

Upload qlora-32b-part2.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. qlora-32b-part2.yaml +73 -0
qlora-32b-part2.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: Guilherme34/secretmodel-indevelopment-full-part1
2
+ # Automatically upload checkpoint and final model to HF:
+ # hub_model_id: username/custom_model_name
3
+
4
+ #load_in_4bit: true
5
+
6
+ pretraining_dataset:
7
+ - path: Guilherme34/best-dataset-glm47flash
8
+ type: pretrain
9
+ text_column: text
10
+ dataset_prepared_path: last_run_prepared
11
+ val_set_size: 0
12
+ output_dir: ./outputs/qlora-out
13
+ save_steps: 1000
14
+ adapter: lora
15
+ lora_model_dir:
16
+
17
+ sequence_len: 4096
18
+ sample_packing: true
19
+ eval_sample_packing: true
20
+
21
+
22
+ lora_r: 24
23
+ lora_alpha: 48
24
+ lora_dropout: 0.05
25
+ lora_target_modules:
26
+ - gate_proj
27
+ - down_proj
28
+ - up_proj
29
+ - q_proj
30
+ - v_proj
31
+ - k_proj
32
+ - o_proj
33
+
34
+ gradient_accumulation_steps: 1
35
+ micro_batch_size: 1
36
+ #num_epochs: 2
37
+ max_steps: 5680
38
+ optimizer: adamw_torch_fused
39
+ lr_scheduler: cosine
40
+ learning_rate: 0.0002
41
+
42
+ bf16: true
43
+ tf32: false
44
+
45
+ gradient_checkpointing: true
46
+ resume_from_checkpoint:
47
+ logging_steps: 1
48
+ flash_attention: true
49
+
50
+ loss_watchdog_threshold: 5.0
51
+ loss_watchdog_patience: 3
52
+
53
+ warmup_ratio: 0.1
54
+ evals_per_epoch: 1
55
+ #saves_per_epoch: 1
56
+ weight_decay: 0.0
57
+
58
+ # weight_decay: 0.0  # duplicate key commented out — weight_decay is already set above
59
+ fsdp:
60
+ - full_shard
61
+ - auto_wrap
62
+ fsdp_config:
63
+ fsdp_limit_all_gathers: true
64
+ fsdp_sync_module_states: true
65
+ fsdp_offload_params: true
66
+ fsdp_use_orig_params: false
67
+ fsdp_cpu_ram_efficient_loading: true
68
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
69
+ fsdp_transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
70
+ fsdp_state_dict_type: FULL_STATE_DICT
71
+ fsdp_sharding_strategy: FULL_SHARD
72
+ special_tokens:
73
+ pad_token: <|endoftext|>