LordNeel
/

training-scripts

Model card Files Files and versions

xet

Community

LordNeel committed on Jan 21

Commit

808673c

verified ·

1 Parent(s): 8821eb6

Upload train_glm47_flash.py with huggingface_hub

Browse files

Files changed (1) hide show

train_glm47_flash.py +26 -11

train_glm47_flash.py CHANGED Viewed

@@ -14,13 +14,17 @@
 """
 Fine-tune GLM-4.7-Flash on Unblinded Mastery dataset for QA and instruction following.
-Using TRL SFTTrainer with LoRA on A100-80GB.
 """
 import torch
 import trackio
 from datasets import load_dataset
-from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTTrainer, SFTConfig
@@ -70,11 +74,22 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
     trust_remote_code=True,
     torch_dtype=torch.bfloat16,
 )
 print("Model loaded!")
-# Prepare model for k-bit training
-model = prepare_model_for_kbit_training(model)
 # Find all linear layer names for LoRA
 print("\nFinding linear layers for LoRA...")
@@ -93,11 +108,11 @@ def find_all_linear_names(model):
 target_modules = find_all_linear_names(model)
 print(f"   Found target modules: {target_modules}")
-# LoRA configuration
 print("\nConfiguring LoRA...")
 peft_config = LoraConfig(
-    r=32,
-    lora_alpha=64,
     lora_dropout=0.05,
     bias="none",
     task_type=TaskType.CAUSAL_LM,
@@ -139,11 +154,11 @@ training_config = SFTConfig(
     # Training parameters
     num_train_epochs=3,
-    per_device_train_batch_size=2,
-    per_device_eval_batch_size=2,
-    gradient_accumulation_steps=8,  # Effective batch size: 16
     learning_rate=2e-4,
-    max_seq_length=2048,
     # Memory optimization
     gradient_checkpointing=True,

 """
 Fine-tune GLM-4.7-Flash on Unblinded Mastery dataset for QA and instruction following.
+Using TRL SFTTrainer with LoRA on H100.
 """
+import os
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 import torch
+import gc
 import trackio
 from datasets import load_dataset
+from peft import LoraConfig, TaskType, get_peft_model
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTTrainer, SFTConfig
     device_map="auto",
     trust_remote_code=True,
     torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    use_cache=False,  # Disable KV cache for training
+    attn_implementation="eager",  # Use the standard (eager) attention implementation
 )
 print("Model loaded!")
+# Enable gradient checkpointing
+model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+# Enable input gradients for LoRA (lighter than prepare_model_for_kbit_training)
+model.enable_input_require_grads()
+# Clear memory
+gc.collect()
+torch.cuda.empty_cache()
+print(f"GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB allocated")
 # Find all linear layer names for LoRA
 print("\nFinding linear layers for LoRA...")
 target_modules = find_all_linear_names(model)
 print(f"   Found target modules: {target_modules}")
+# LoRA configuration - using lower rank for memory efficiency
 print("\nConfiguring LoRA...")
 peft_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
     lora_dropout=0.05,
     bias="none",
     task_type=TaskType.CAUSAL_LM,
     # Training parameters
     num_train_epochs=3,
+    per_device_train_batch_size=1,
+    per_device_eval_batch_size=1,
+    gradient_accumulation_steps=16,  # Effective batch size: 16
     learning_rate=2e-4,
+    max_seq_length=1024,  # Reduced for memory
     # Memory optimization
     gradient_checkpointing=True,