Upload qlora-32b-part2.yaml with huggingface_hub
qlora-32b-part2.yaml: ADDED (+73, -0)
@@ -0,0 +1,73 @@
base_model: Guilherme34/secretmodel-indevelopment-full-part1
# hub_model_id: username/custom_model_name # automatically upload checkpoints and final model to HF

#load_in_4bit: true

pretraining_dataset:
  - path: Guilherme34/best-dataset-glm47flash
    type: pretrain
    text_column: text
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/qlora-out
save_steps: 1000
adapter: lora
lora_model_dir:

sequence_len: 4096
sample_packing: true
eval_sample_packing: true


lora_r: 24
lora_alpha: 48
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

gradient_accumulation_steps: 1
micro_batch_size: 1
#num_epochs: 2
max_steps: 5680
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 1
#saves_per_epoch: 1
weight_decay: 0.0


fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|endoftext|>
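With axolotl installed, a config like this is typically passed straight to the trainer, e.g. `accelerate launch -m axolotl.cli.train qlora-32b-part2.yaml`. After training, the LoRA adapter written to output_dir (./outputs/qlora-out above) can be attached back onto the base model with PEFT. The snippet below is a minimal sketch and is not part of the uploaded file; the paths are taken from the config, while the dtype and trust_remote_code choices are assumptions about the base checkpoint.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Values taken from the config above; adjust if the adapter was pushed to the Hub instead.
BASE_ID = "Guilherme34/secretmodel-indevelopment-full-part1"  # base_model
ADAPTER_DIR = "./outputs/qlora-out"                           # output_dir

tokenizer = AutoTokenizer.from_pretrained(BASE_ID, trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    torch_dtype=torch.bfloat16,   # config trains with bf16: true
    device_map="auto",
    trust_remote_code=True,       # assumption: the GLM-4-MoE-style base may ship custom modeling code
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)  # load the LoRA weights on top of the base
model.eval()

If a standalone checkpoint is needed, PEFT's merge_and_unload() can fold the adapter into the base weights before saving.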