slivk committed on
Commit
cbaf615
·
1 Parent(s): da22234

fix: Explicitly disable bf16 for T4 GPU compatibility

Browse files
Files changed (2) hide show
  1. run_sft_full.py +3 -0
  2. run_sft_test.py +1 -0
run_sft_full.py CHANGED
@@ -89,6 +89,7 @@ training_args = SFTConfig(
89
 
90
  # Precision
91
  fp16=True, # Use FP16 for training
 
92
 
93
  # Logging
94
  logging_steps=5,
@@ -127,6 +128,8 @@ trainer = SFTTrainer(
127
  print("✅ Trainer initialized")
128
 
129
  # Show GPU memory before training
 
 
130
  if torch.cuda.is_available():
131
  gpu_stats = torch.cuda.get_device_properties(0)
132
  start_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
 
89
 
90
  # Precision
91
  fp16=True, # Use FP16 for training
92
+ bf16=False, # Explicitly disable bfloat16 (T4 compatibility)
93
 
94
  # Logging
95
  logging_steps=5,
 
128
  print("✅ Trainer initialized")
129
 
130
  # Show GPU memory before training
131
+ print(f"CUDA available: {torch.cuda.is_available()}")
132
+ print(f"PyTorch CUDA version: {torch.version.cuda}")
133
  if torch.cuda.is_available():
134
  gpu_stats = torch.cuda.get_device_properties(0)
135
  start_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
run_sft_test.py CHANGED
@@ -85,6 +85,7 @@ training_args = SFTConfig(
85
 
86
  # Precision
87
  fp16=True,
 
88
 
89
  # Logging
90
  logging_steps=1,
 
85
 
86
  # Precision
87
  fp16=True,
88
+ bf16=False, # Explicitly disable bfloat16 (T4 compatibility)
89
 
90
  # Logging
91
  logging_steps=1,