Hexa09 committed on
Commit
b7f307c
·
verified ·
1 Parent(s): 271b413

Upload src/train_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/train_hf.py +13 -1
src/train_hf.py CHANGED
@@ -81,6 +81,18 @@ def train():
81
  if len(dataset) == 0:
82
  raise ValueError("Dataset is empty! Generation failed.")
83
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # 3. Training Arguments for Memory Optimization
85
  args = TrainingArguments(
86
  output_dir="./hexa_checkpoints",
@@ -90,7 +102,7 @@ def train():
90
  num_train_epochs=3,
91
  logging_steps=1,
92
  save_steps=100,
93
- fp16=True, # Enable Mixed Precision (Critical)
94
  gradient_checkpointing=True, # CRITICAL for 5B model memory
95
  dataloader_num_workers=0, # Avoid multiprocessing overhead
96
  report_to="tensorboard",
 
81
  if len(dataset) == 0:
82
  raise ValueError("Dataset is empty! Generation failed.")
83
 
84
+ # Hardware Verification
85
+ print("--------------------------------------")
86
+ print(f"HW CHECK: Torch Version: {torch.__version__}")
87
+ print(f"HW CHECK: CUDA Available: {torch.cuda.is_available()}")
88
+ if torch.cuda.is_available():
89
+ print(f"HW CHECK: Device Name: {torch.cuda.get_device_name(0)}")
90
+ use_fp16 = True
91
+ else:
92
+ print("HW CHECK: WARNING: Running on CPU! FP16 disabled to prevent crash.")
93
+ use_fp16 = False
94
+ print("--------------------------------------")
95
+
96
  # 3. Training Arguments for Memory Optimization
97
  args = TrainingArguments(
98
  output_dir="./hexa_checkpoints",
 
102
  num_train_epochs=3,
103
  logging_steps=1,
104
  save_steps=100,
105
+ fp16=use_fp16, # Dynamic Mix Precision
106
  gradient_checkpointing=True, # CRITICAL for 5B model memory
107
  dataloader_num_workers=0, # Avoid multiprocessing overhead
108
  report_to="tensorboard",