Codyfederer committed on
Commit
68b43d2
·
verified ·
1 Parent(s): 3924e02

Upload train_qwen3_8b_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_qwen3_8b_hf.py +10 -10
train_qwen3_8b_hf.py CHANGED
@@ -53,7 +53,7 @@ eval_dataset = dataset_split["test"]
53
  print(f" Train: {len(train_dataset)} examples")
54
  print(f" Eval: {len(eval_dataset)} examples")
55
 
56
- # Training configuration
57
  config = SFTConfig(
58
  # Hub settings
59
  output_dir="qwen3-8b-vyvo-copilot",
@@ -62,26 +62,27 @@ config = SFTConfig(
62
  hub_strategy="every_save",
63
  hub_private_repo=False,
64
 
65
- # Training parameters - optimized for 8B model with LoRA
66
  num_train_epochs=3,
67
- per_device_train_batch_size=2,
68
- gradient_accumulation_steps=8, # Effective batch size = 16
69
  learning_rate=2e-4,
70
- max_length=2048, # Good context for multi-turn conversations
71
 
72
  # Memory optimization
73
  gradient_checkpointing=True,
 
74
  bf16=True,
 
75
 
76
  # Logging & checkpointing
77
  logging_steps=10,
78
  save_strategy="steps",
79
  save_steps=200,
80
- save_total_limit=3,
81
 
82
- # Evaluation
83
- eval_strategy="steps",
84
- eval_steps=200,
85
 
86
  # Optimization
87
  warmup_ratio=0.05,
@@ -109,7 +110,6 @@ print("🎯 Initializing trainer with Qwen/Qwen3-8B...")
109
  trainer = SFTTrainer(
110
  model="Qwen/Qwen3-8B",
111
  train_dataset=train_dataset,
112
- eval_dataset=eval_dataset,
113
  args=config,
114
  peft_config=peft_config,
115
  )
 
53
  print(f" Train: {len(train_dataset)} examples")
54
  print(f" Eval: {len(eval_dataset)} examples")
55
 
56
+ # Training configuration - optimized for memory on A10G
57
  config = SFTConfig(
58
  # Hub settings
59
  output_dir="qwen3-8b-vyvo-copilot",
 
62
  hub_strategy="every_save",
63
  hub_private_repo=False,
64
 
65
+ # Training parameters - reduced for memory
66
  num_train_epochs=3,
67
+ per_device_train_batch_size=1, # Reduced from 2
68
+ gradient_accumulation_steps=16, # Increased to maintain effective batch size
69
  learning_rate=2e-4,
70
+ max_length=1024, # Reduced from 2048 to save memory
71
 
72
  # Memory optimization
73
  gradient_checkpointing=True,
74
+ gradient_checkpointing_kwargs={"use_reentrant": False},
75
  bf16=True,
76
+ optim="adamw_8bit", # Use 8-bit optimizer to save memory
77
 
78
  # Logging & checkpointing
79
  logging_steps=10,
80
  save_strategy="steps",
81
  save_steps=200,
82
+ save_total_limit=2,
83
 
84
+ # Evaluation - skip eval during training to save memory
85
+ eval_strategy="no",
 
86
 
87
  # Optimization
88
  warmup_ratio=0.05,
 
110
  trainer = SFTTrainer(
111
  model="Qwen/Qwen3-8B",
112
  train_dataset=train_dataset,
 
113
  args=config,
114
  peft_config=peft_config,
115
  )