LordNeel committed on
Commit
808673c
·
verified ·
1 Parent(s): 8821eb6

Upload train_glm47_flash.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_glm47_flash.py +26 -11
train_glm47_flash.py CHANGED
@@ -14,13 +14,17 @@
14
 
15
  """
16
  Fine-tune GLM-4.7-Flash on Unblinded Mastery dataset for QA and instruction following.
17
- Using TRL SFTTrainer with LoRA on A100-80GB.
18
  """
19
 
 
 
 
20
  import torch
 
21
  import trackio
22
  from datasets import load_dataset
23
- from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
24
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
25
  from trl import SFTTrainer, SFTConfig
26
 
@@ -70,11 +74,22 @@ model = AutoModelForCausalLM.from_pretrained(
70
  device_map="auto",
71
  trust_remote_code=True,
72
  torch_dtype=torch.bfloat16,
 
 
 
73
  )
74
  print("Model loaded!")
75
 
76
- # Prepare model for k-bit training
77
- model = prepare_model_for_kbit_training(model)
 
 
 
 
 
 
 
 
78
 
79
  # Find all linear layer names for LoRA
80
  print("\nFinding linear layers for LoRA...")
@@ -93,11 +108,11 @@ def find_all_linear_names(model):
93
  target_modules = find_all_linear_names(model)
94
  print(f" Found target modules: {target_modules}")
95
 
96
- # LoRA configuration
97
  print("\nConfiguring LoRA...")
98
  peft_config = LoraConfig(
99
- r=32,
100
- lora_alpha=64,
101
  lora_dropout=0.05,
102
  bias="none",
103
  task_type=TaskType.CAUSAL_LM,
@@ -139,11 +154,11 @@ training_config = SFTConfig(
139
 
140
  # Training parameters
141
  num_train_epochs=3,
142
- per_device_train_batch_size=2,
143
- per_device_eval_batch_size=2,
144
- gradient_accumulation_steps=8, # Effective batch size: 16
145
  learning_rate=2e-4,
146
- max_seq_length=2048,
147
 
148
  # Memory optimization
149
  gradient_checkpointing=True,
 
14
 
15
  """
16
  Fine-tune GLM-4.7-Flash on Unblinded Mastery dataset for QA and instruction following.
17
+ Using TRL SFTTrainer with LoRA on H100.
18
  """
19
 
20
+ import os
21
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
22
+
23
  import torch
24
+ import gc
25
  import trackio
26
  from datasets import load_dataset
27
+ from peft import LoraConfig, TaskType, get_peft_model
28
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
29
  from trl import SFTTrainer, SFTConfig
30
 
 
74
  device_map="auto",
75
  trust_remote_code=True,
76
  torch_dtype=torch.bfloat16,
77
+ low_cpu_mem_usage=True,
78
+ use_cache=False, # Disable KV cache for training
79
+ attn_implementation="eager", # Use standard attention to save memory
80
  )
81
  print("Model loaded!")
82
 
83
+ # Enable gradient checkpointing
84
+ model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
85
+
86
+ # Enable input gradients for LoRA (lighter than prepare_model_for_kbit_training)
87
+ model.enable_input_require_grads()
88
+
89
+ # Clear memory
90
+ gc.collect()
91
+ torch.cuda.empty_cache()
92
+ print(f"GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB allocated")
93
 
94
  # Find all linear layer names for LoRA
95
  print("\nFinding linear layers for LoRA...")
 
108
  target_modules = find_all_linear_names(model)
109
  print(f" Found target modules: {target_modules}")
110
 
111
+ # LoRA configuration - using lower rank for memory efficiency
112
  print("\nConfiguring LoRA...")
113
  peft_config = LoraConfig(
114
+ r=16,
115
+ lora_alpha=32,
116
  lora_dropout=0.05,
117
  bias="none",
118
  task_type=TaskType.CAUSAL_LM,
 
154
 
155
  # Training parameters
156
  num_train_epochs=3,
157
+ per_device_train_batch_size=1,
158
+ per_device_eval_batch_size=1,
159
+ gradient_accumulation_steps=16, # Effective batch size: 16
160
  learning_rate=2e-4,
161
+ max_seq_length=1024, # Reduced for memory
162
 
163
  # Memory optimization
164
  gradient_checkpointing=True,