Spaces:
Runtime error
Runtime error
Upload src/train_hf.py with huggingface_hub
Browse files
- src/train_hf.py +13 -1
src/train_hf.py
CHANGED
|
@@ -81,6 +81,18 @@ def train():
|
|
| 81 |
if len(dataset) == 0:
|
| 82 |
raise ValueError("Dataset is empty! Generation failed.")
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# 3. Training Arguments for Memory Optimization
|
| 85 |
args = TrainingArguments(
|
| 86 |
output_dir="./hexa_checkpoints",
|
|
@@ -90,7 +102,7 @@ def train():
|
|
| 90 |
num_train_epochs=3,
|
| 91 |
logging_steps=1,
|
| 92 |
save_steps=100,
|
| 93 |
-
fp16=True,
|
| 94 |
gradient_checkpointing=True, # CRITICAL for 5B model memory
|
| 95 |
dataloader_num_workers=0, # Avoid multiprocessing overhead
|
| 96 |
report_to="tensorboard",
|
|
|
|
| 81 |
if len(dataset) == 0:
|
| 82 |
raise ValueError("Dataset is empty! Generation failed.")
|
| 83 |
|
| 84 |
+
# Hardware Verification
|
| 85 |
+
print("--------------------------------------")
|
| 86 |
+
print(f"HW CHECK: Torch Version: {torch.__version__}")
|
| 87 |
+
print(f"HW CHECK: CUDA Available: {torch.cuda.is_available()}")
|
| 88 |
+
if torch.cuda.is_available():
|
| 89 |
+
print(f"HW CHECK: Device Name: {torch.cuda.get_device_name(0)}")
|
| 90 |
+
use_fp16 = True
|
| 91 |
+
else:
|
| 92 |
+
print("HW CHECK: WARNING: Running on CPU! FP16 disabled to prevent crash.")
|
| 93 |
+
use_fp16 = False
|
| 94 |
+
print("--------------------------------------")
|
| 95 |
+
|
| 96 |
# 3. Training Arguments for Memory Optimization
|
| 97 |
args = TrainingArguments(
|
| 98 |
output_dir="./hexa_checkpoints",
|
|
|
|
| 102 |
num_train_epochs=3,
|
| 103 |
logging_steps=1,
|
| 104 |
save_steps=100,
|
| 105 |
+
fp16=use_fp16, # Dynamic Mix Precision
|
| 106 |
gradient_checkpointing=True, # CRITICAL for 5B model memory
|
| 107 |
dataloader_num_workers=0, # Avoid multiprocessing overhead
|
| 108 |
report_to="tensorboard",
|