Hexa09 committed on
Commit
b7f307c
·
verified ·
1 Parent(s): 271b413

Upload src/train_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/train_hf.py +13 -1
src/train_hf.py CHANGED
@@ -81,6 +81,18 @@ def train():
81
  if len(dataset) == 0:
82
  raise ValueError("Dataset is empty! Generation failed.")
83
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # 3. Training Arguments for Memory Optimization
85
  args = TrainingArguments(
86
  output_dir="./hexa_checkpoints",
@@ -90,7 +102,7 @@ def train():
90
  num_train_epochs=3,
91
  logging_steps=1,
92
  save_steps=100,
93
- fp16=True, # Enable Mixed Precision (Critical)
94
  gradient_checkpointing=True, # CRITICAL for 5B model memory
95
  dataloader_num_workers=0, # Avoid multiprocessing overhead
96
  report_to="tensorboard",
 
81
  if len(dataset) == 0:
82
  raise ValueError("Dataset is empty! Generation failed.")
83
 
84
+ # Hardware Verification
85
+ print("--------------------------------------")
86
+ print(f"HW CHECK: Torch Version: {torch.__version__}")
87
+ print(f"HW CHECK: CUDA Available: {torch.cuda.is_available()}")
88
+ if torch.cuda.is_available():
89
+ print(f"HW CHECK: Device Name: {torch.cuda.get_device_name(0)}")
90
+ use_fp16 = True
91
+ else:
92
+ print("HW CHECK: WARNING: Running on CPU! FP16 disabled to prevent crash.")
93
+ use_fp16 = False
94
+ print("--------------------------------------")
95
+
96
  # 3. Training Arguments for Memory Optimization
97
  args = TrainingArguments(
98
  output_dir="./hexa_checkpoints",
 
102
  num_train_epochs=3,
103
  logging_steps=1,
104
  save_steps=100,
105
+ fp16=use_fp16, # Dynamic Mix Precision
106
  gradient_checkpointing=True, # CRITICAL for 5B model memory
107
  dataloader_num_workers=0, # Avoid multiprocessing overhead
108
  report_to="tensorboard",