# Source: ipad-vad-training / gpu_training_standalone.py
# Author: Claude Code
# Commit d14d520: "Add auto-start training on Space rebuild"
#!/usr/bin/env python3
"""
Standalone GPU training script with @spaces.GPU decorator
This properly requests ZeroGPU allocation
"""
import sys
import importlib  # NOTE(review): imported but not used below — only sys.modules is touched directly
# Force reload to get bugfix
# Dropping the cached module forces the next `import IPAD.model.memory_module`
# to re-execute the (patched) module source instead of reusing the stale copy.
if 'IPAD.model.memory_module' in sys.modules:
    del sys.modules['IPAD.model.memory_module']
import spaces  # ZeroGPU decorator — NOTE: on ZeroGPU, `spaces` should be imported before `torch`; keep this order
import torch
from datetime import datetime
# Startup banner with a wall-clock timestamp for the Space logs.
print("="*70)
print("πŸš€ IPAD VAD GPU Training (ZeroGPU)")
print("="*70)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()
@spaces.GPU(duration=3600)  # Request GPU for 1 hour
def train_on_gpu(device_name="S01", epochs=10, batch_size=4, lr=1e-4,
                 dataset_path="/app/cache/IPAD_dataset"):
    """Run one IPAD training session inside a ZeroGPU allocation.

    The @spaces.GPU decorator defers GPU allocation until this function is
    called, so CUDA may only become available here (possibly after a wait).

    Args:
        device_name: IPAD device/scene identifier used for training and for
            matching checkpoint filenames (``{device_name}_*.pth``).
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        lr: Learning rate.
        dataset_path: Filesystem path to the IPAD dataset root.

    Returns:
        A human-readable success message string.
    """
    # Lazy imports: keep heavy project code out of module import time so the
    # decorator/banner run even if training deps are slow to load.
    import time
    from pathlib import Path
    from train_hf import IPADTrainer

    print("πŸ” Inside @spaces.GPU decorated function")
    print(f"   CUDA Available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"   βœ… GPU: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        # ZeroGPU allocation is asynchronous; CUDA can appear a few minutes in.
        print("   ⚠️ No GPU allocated yet (might take 1-5 minutes)")
    print()

    print("πŸ“‹ Configuration:")
    print(f"   Device: {device_name}")
    print(f"   Epochs: {epochs}")
    print(f"   Batch Size: {batch_size}")
    print(f"   Learning Rate: {lr}")
    print()

    # Create trainer (no experiment tracking / hub upload in this standalone run).
    print("πŸ“¦ Initializing trainer...")
    trainer = IPADTrainer(
        device_name=device_name,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,
        hf_repo=None
    )
    print("βœ… Trainer initialized")
    print()

    # Train, timing the whole run for the summary line below.
    print("πŸ‹οΈ Starting GPU training...")  # was an f-string with no placeholders
    print()
    start_time = time.time()
    trainer.train(dataset_path)
    end_time = time.time()
    print()
    print("="*70)
    print(f"βœ… Training completed in {(end_time - start_time) / 60:.1f} minutes!")
    print("="*70)

    # List any checkpoints the trainer wrote for this device.
    # Path.glob on a missing directory simply yields nothing, so no guard needed.
    checkpoint_dir = Path("./checkpoints")
    checkpoints = list(checkpoint_dir.glob(f"{device_name}_*.pth"))
    if checkpoints:
        print()
        print("πŸ’Ύ Checkpoints saved:")
        for ckpt in sorted(checkpoints):
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f"   - {ckpt.name} ({size_mb:.1f} MB)")
    return "Training completed successfully!"
# Kick off the training run; the call itself is what triggers the ZeroGPU request.
print("🎯 Calling GPU training function...")
print("   (This will request ZeroGPU allocation)")
print()
try:
    outcome = train_on_gpu()
except Exception as err:
    # Top-level boundary: report the failure with a full traceback but keep
    # going so the closing banner is always printed.
    print(f"❌ Training failed: {err}")
    import traceback
    traceback.print_exc()
else:
    print()
    print(f"βœ… {outcome}")
print()
print("=" * 70)
print("🏁 GPU training script finished")
print("=" * 70)