File size: 3,035 Bytes
d14d520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""Standalone GPU training script for IPAD VAD on Hugging Face ZeroGPU.

The actual training entry point below is wrapped in the ``@spaces.GPU``
decorator, which is what requests the ZeroGPU allocation — running the
trainer outside that decorated function would never get a GPU.
"""
import sys
import importlib  # NOTE(review): unused here — presumably kept for interactive reloads; confirm before removing

# Evict a stale copy of the memory module so the next import picks up the
# bugfixed version (relevant when this script runs in a long-lived process
# that already imported it).
# Force reload to get bugfix
if 'IPAD.model.memory_module' in sys.modules:
    del sys.modules['IPAD.model.memory_module']

import spaces  # ZeroGPU decorator
import torch
from datetime import datetime

# Startup banner — printed at import/run time, before any GPU is requested.
print("="*70)
print("πŸš€ IPAD VAD GPU Training (ZeroGPU)")
print("="*70)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

@spaces.GPU(duration=3600)  # Request a ZeroGPU slot for up to 1 hour
def train_on_gpu(
    device_name="S01",
    epochs=10,
    batch_size=4,
    lr=1e-4,
    dataset_path="/app/cache/IPAD_dataset",
):
    """Run IPAD VAD training inside a ZeroGPU allocation.

    All CUDA work must happen inside this function: the ``@spaces.GPU``
    decorator only holds the GPU while the decorated call is executing.

    Args:
        device_name: Dataset device/scene identifier used to select data
            and to name checkpoint files (default ``"S01"``).
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        lr: Optimizer learning rate.
        dataset_path: Filesystem path to the prepared IPAD dataset.

    Returns:
        A human-readable success message.
    """
    import time
    from pathlib import Path

    # Deferred, project-local import: heavy (pulls in the whole training
    # stack) and must happen after the sys.modules eviction at module top.
    from train_hf import IPADTrainer

    print("πŸ” Inside @spaces.GPU decorated function")
    print(f"   CUDA Available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        print(f"   βœ… GPU: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        # ZeroGPU allocation can lag behind function entry.
        print("   ⚠️  No GPU allocated yet (might take 1-5 minutes)")
    print()

    print("πŸ“‹ Configuration:")
    print(f"   Device: {device_name}")
    print(f"   Epochs: {epochs}")
    print(f"   Batch Size: {batch_size}")
    print(f"   Learning Rate: {lr}")
    print()

    print("πŸ“¦ Initializing trainer...")
    trainer = IPADTrainer(
        device_name=device_name,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,   # disable experiment tracking
        hf_repo=None,         # no Hub upload from this script
    )
    print("βœ… Trainer initialized")
    print()

    print("πŸ‹οΈ  Starting GPU training...")
    print()

    start_time = time.time()
    trainer.train(dataset_path)
    elapsed_minutes = (time.time() - start_time) / 60

    print()
    print("="*70)
    print(f"βœ… Training completed in {elapsed_minutes:.1f} minutes!")
    print("="*70)

    # Report any checkpoints the trainer wrote for this device.
    checkpoint_dir = Path("./checkpoints")
    checkpoints = sorted(checkpoint_dir.glob(f"{device_name}_*.pth"))
    if checkpoints:
        print()
        print("πŸ’Ύ Checkpoints saved:")
        for ckpt in checkpoints:
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f"   - {ckpt.name} ({size_mb:.1f} MB)")

    return "Training completed successfully!"

# Run training at module level — this script is meant to be executed, and
# calling the decorated function is what triggers the ZeroGPU request.
print("🎯 Calling GPU training function...")
print("   (This will request ZeroGPU allocation)")
print()

exit_code = 0
try:
    result = train_on_gpu()
    print()
    print(f"βœ… {result}")
except Exception as e:
    # Broad catch is deliberate at this top-level boundary: report the
    # failure and fall through so the closing banner still prints, but
    # remember it so the process can exit nonzero.
    exit_code = 1
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()

print()
print("="*70)
print("🏁 GPU training script finished")
print("="*70)

# Propagate failure to the caller (Space runner / CI) via the exit status;
# previously the script always exited 0 even when training failed.
if exit_code:
    sys.exit(exit_code)