# Source: ipad-vad-training / trigger_training.py
# Commit d14d520 (Claude Code): Add auto-start training on Space rebuild
#!/usr/bin/env python3
"""
Trigger GPU training through Gradio interface.

Uses a direct in-process call to the project's IPADTrainer. This is a flat
script by design (no ``__main__`` guard): the training attempt starts as
soon as the file is executed, e.g. on a Space rebuild.
"""
import time
from datetime import datetime

SEP = "=" * 70  # visual separator reused throughout the log output

print(SEP)
print("πŸš€ IPAD VAD Training Trigger")
print(SEP)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# Method 1: Direct function call (since we're in the same process)
print("[Method 1] Direct function call (fastest)")
print("-" * 70)

try:
    # Import the project-local training entry point. Any failure here
    # (missing module, missing torch, ...) falls through to the broad
    # handler below, which prints troubleshooting hints instead of crashing.
    from train_hf import IPADTrainer
    print("βœ… Imported IPADTrainer successfully")
    print()

    # Smoke-test hyperparameters: 1 epoch / batch size 2 so a CPU-only
    # environment can still exercise the training loop end to end.
    print("πŸ“‹ Configuration:")
    print(" Device: S01 (Conveyor Belt)")
    print(" Epochs: 1 (smoke test on CPU)")
    print(" Batch Size: 2 (reduced for CPU)")
    print(" Learning Rate: 1e-4")
    print(" Memory Dimension: 2000")
    print(" ⚠️ Note: This is a CPU smoke test. Full GPU training needs Gradio interface.")
    print()

    trainer = IPADTrainer(
        device_name="S01",
        epochs=1,            # just 1 epoch to verify training works
        batch_size=2,        # reduced for CPU
        lr=1e-4,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,  # disable wandb for quick test
        hf_repo=None,        # disable HF upload for quick test
    )
    print("βœ… Trainer initialized")
    print()

    # Report GPU availability. NOTE: on ZeroGPU Spaces a GPU is only
    # allocated inside a @spaces.GPU-decorated Gradio handler, so this
    # direct call is expected to land on CPU.
    import torch
    print("πŸ” Checking GPU availability...")
    print(f" CUDA Available: {torch.cuda.is_available()}")
    print(f" Device Count: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        print(f" Device Name: {torch.cuda.get_device_name(0)}")
        print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print(" ⚠️ No GPU detected - this will run on CPU (very slow)")
        print(" ⚠️ ZeroGPU allocation only works through Gradio @spaces.GPU decorator")
    print()

    # Kick off training and time it end to end.
    dataset_path = "/app/cache/IPAD_dataset"
    print("πŸ‹οΈ Starting training...")
    print(f" Dataset: {dataset_path}")
    print(" This will take ~10-15 minutes on GPU, several hours on CPU")
    print()
    print(SEP)
    print()

    start_time = time.time()
    trainer.train(dataset_path)
    end_time = time.time()

    print()
    print(SEP)
    print(f"βœ… Training completed in {(end_time - start_time) / 60:.1f} minutes!")
    print(SEP)

    # List whatever checkpoints the trainer wrote so the log shows the
    # produced artifacts (or the lack of them).
    from pathlib import Path
    checkpoint_dir = Path("./checkpoints")
    checkpoints = list(checkpoint_dir.glob("S01_*.pth"))
    if checkpoints:
        print()
        print("πŸ’Ύ Checkpoints saved:")
        for ckpt in sorted(checkpoints):
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f" - {ckpt.name} ({size_mb:.1f} MB)")
    else:
        print()
        print("⚠️ No checkpoints found - check logs for errors")

except Exception as e:
    # Broad catch is deliberate at this top-level script boundary: report
    # the failure with a traceback and hints rather than crash the Space.
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()
    print()
    print(SEP)
    print("πŸ’‘ Troubleshooting:")
    print(" 1. Check GPU availability (might need @spaces.GPU decorator)")
    print(" 2. Check dataset path exists")
    print(" 3. Check logs for detailed error messages")
    print(SEP)

print()
print(SEP)
print("🏁 Training trigger script finished")
print(SEP)