# Source: ipad-vad-training / trigger_training.py
# Commit d14d520 (Claude Code): Add auto-start training on Space rebuild
#!/usr/bin/env python3
"""
Trigger GPU training through Gradio interface.

Uses a direct in-process call to the project's IPADTrainer. This is a flat
script by design (no ``__main__`` guard): the training attempt starts as
soon as the file is executed, e.g. on a Space rebuild.
"""
import time
from datetime import datetime

SEP = "=" * 70  # visual separator reused throughout the log output

print(SEP)
print("πŸš€ IPAD VAD Training Trigger")
print(SEP)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# Method 1: Direct function call (since we're in the same process)
print("[Method 1] Direct function call (fastest)")
print("-" * 70)

try:
    # Import the project-local training entry point. Any failure here
    # (missing module, missing torch, ...) falls through to the broad
    # handler below, which prints troubleshooting hints instead of crashing.
    from train_hf import IPADTrainer
    print("βœ… Imported IPADTrainer successfully")
    print()

    # Smoke-test hyperparameters: 1 epoch / batch size 2 so a CPU-only
    # environment can still exercise the training loop end to end.
    print("πŸ“‹ Configuration:")
    print(" Device: S01 (Conveyor Belt)")
    print(" Epochs: 1 (smoke test on CPU)")
    print(" Batch Size: 2 (reduced for CPU)")
    print(" Learning Rate: 1e-4")
    print(" Memory Dimension: 2000")
    print(" ⚠️ Note: This is a CPU smoke test. Full GPU training needs Gradio interface.")
    print()

    trainer = IPADTrainer(
        device_name="S01",
        epochs=1,            # just 1 epoch to verify training works
        batch_size=2,        # reduced for CPU
        lr=1e-4,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,  # disable wandb for quick test
        hf_repo=None,        # disable HF upload for quick test
    )
    print("βœ… Trainer initialized")
    print()

    # Report GPU availability. NOTE: on ZeroGPU Spaces a GPU is only
    # allocated inside a @spaces.GPU-decorated Gradio handler, so this
    # direct call is expected to land on CPU.
    import torch
    print("πŸ” Checking GPU availability...")
    print(f" CUDA Available: {torch.cuda.is_available()}")
    print(f" Device Count: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        print(f" Device Name: {torch.cuda.get_device_name(0)}")
        print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print(" ⚠️ No GPU detected - this will run on CPU (very slow)")
        print(" ⚠️ ZeroGPU allocation only works through Gradio @spaces.GPU decorator")
    print()

    # Kick off training and time it end to end.
    dataset_path = "/app/cache/IPAD_dataset"
    print("πŸ‹οΈ Starting training...")
    print(f" Dataset: {dataset_path}")
    print(" This will take ~10-15 minutes on GPU, several hours on CPU")
    print()
    print(SEP)
    print()

    start_time = time.time()
    trainer.train(dataset_path)
    end_time = time.time()

    print()
    print(SEP)
    print(f"βœ… Training completed in {(end_time - start_time) / 60:.1f} minutes!")
    print(SEP)

    # List whatever checkpoints the trainer wrote so the log shows the
    # produced artifacts (or the lack of them).
    from pathlib import Path
    checkpoint_dir = Path("./checkpoints")
    checkpoints = list(checkpoint_dir.glob("S01_*.pth"))
    if checkpoints:
        print()
        print("πŸ’Ύ Checkpoints saved:")
        for ckpt in sorted(checkpoints):
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f" - {ckpt.name} ({size_mb:.1f} MB)")
    else:
        print()
        print("⚠️ No checkpoints found - check logs for errors")

except Exception as e:
    # Broad catch is deliberate at this top-level script boundary: report
    # the failure with a traceback and hints rather than crash the Space.
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()
    print()
    print(SEP)
    print("πŸ’‘ Troubleshooting:")
    print(" 1. Check GPU availability (might need @spaces.GPU decorator)")
    print(" 2. Check dataset path exists")
    print(" 3. Check logs for detailed error messages")
    print(SEP)

print()
print(SEP)
print("🏁 Training trigger script finished")
print(SEP)