File size: 3,035 Bytes
d14d520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""Standalone GPU training script for IPAD VAD on Hugging Face ZeroGPU.

The actual training entry point below is wrapped in the ``@spaces.GPU``
decorator, which is what requests the ZeroGPU allocation — running the
trainer outside that decorated function would never get a GPU.
"""
import sys
import importlib  # NOTE(review): unused here — presumably kept for interactive reloads; confirm before removing

# Evict a stale copy of the memory module so the next import picks up the
# bugfixed version (relevant when this script runs in a long-lived process
# that already imported it).
# Force reload to get bugfix
if 'IPAD.model.memory_module' in sys.modules:
    del sys.modules['IPAD.model.memory_module']

import spaces  # ZeroGPU decorator
import torch
from datetime import datetime

# Startup banner — printed at import/run time, before any GPU is requested.
print("="*70)
print("πŸš€ IPAD VAD GPU Training (ZeroGPU)")
print("="*70)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

@spaces.GPU(duration=3600)  # Request a ZeroGPU slot for up to 1 hour
def train_on_gpu(
    device_name="S01",
    epochs=10,
    batch_size=4,
    lr=1e-4,
    dataset_path="/app/cache/IPAD_dataset",
):
    """Run IPAD VAD training inside a ZeroGPU allocation.

    All CUDA work must happen inside this function: the ``@spaces.GPU``
    decorator only holds the GPU while the decorated call is executing.

    Args:
        device_name: Dataset device/scene identifier used to select data
            and to name checkpoint files (default ``"S01"``).
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        lr: Optimizer learning rate.
        dataset_path: Filesystem path to the prepared IPAD dataset.

    Returns:
        A human-readable success message.
    """
    import time
    from pathlib import Path

    # Deferred, project-local import: heavy (pulls in the whole training
    # stack) and must happen after the sys.modules eviction at module top.
    from train_hf import IPADTrainer

    print("πŸ” Inside @spaces.GPU decorated function")
    print(f"   CUDA Available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        print(f"   βœ… GPU: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        # ZeroGPU allocation can lag behind function entry.
        print("   ⚠️  No GPU allocated yet (might take 1-5 minutes)")
    print()

    print("πŸ“‹ Configuration:")
    print(f"   Device: {device_name}")
    print(f"   Epochs: {epochs}")
    print(f"   Batch Size: {batch_size}")
    print(f"   Learning Rate: {lr}")
    print()

    print("πŸ“¦ Initializing trainer...")
    trainer = IPADTrainer(
        device_name=device_name,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,   # disable experiment tracking
        hf_repo=None,         # no Hub upload from this script
    )
    print("βœ… Trainer initialized")
    print()

    print("πŸ‹οΈ  Starting GPU training...")
    print()

    start_time = time.time()
    trainer.train(dataset_path)
    elapsed_minutes = (time.time() - start_time) / 60

    print()
    print("="*70)
    print(f"βœ… Training completed in {elapsed_minutes:.1f} minutes!")
    print("="*70)

    # Report any checkpoints the trainer wrote for this device.
    checkpoint_dir = Path("./checkpoints")
    checkpoints = sorted(checkpoint_dir.glob(f"{device_name}_*.pth"))
    if checkpoints:
        print()
        print("πŸ’Ύ Checkpoints saved:")
        for ckpt in checkpoints:
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f"   - {ckpt.name} ({size_mb:.1f} MB)")

    return "Training completed successfully!"

# Run training at module level — this script is meant to be executed, and
# calling the decorated function is what triggers the ZeroGPU request.
print("🎯 Calling GPU training function...")
print("   (This will request ZeroGPU allocation)")
print()

exit_code = 0
try:
    result = train_on_gpu()
    print()
    print(f"βœ… {result}")
except Exception as e:
    # Broad catch is deliberate at this top-level boundary: report the
    # failure and fall through so the closing banner still prints, but
    # remember it so the process can exit nonzero.
    exit_code = 1
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()

print()
print("="*70)
print("🏁 GPU training script finished")
print("="*70)

# Propagate failure to the caller (Space runner / CI) via the exit status;
# previously the script always exited 0 even when training failed.
if exit_code:
    sys.exit(exit_code)