#!/usr/bin/env python3
"""
Trigger GPU training through Gradio interface
Uses gradio_client to call the training endpoint
"""
import time
from datetime import datetime

print("=" * 70)
print("🚀 IPAD VAD Training Trigger")
print("=" * 70)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# Method 1: Direct function call (since we're in the same process)
print("[Method 1] Direct function call (fastest)")
print("-" * 70)

try:
    # Import the training function directly.
    # NOTE(review): `train_hf` is a project-local module — assumed to expose
    # an IPADTrainer class with a train(dataset_path) method; confirm.
    from train_hf import IPADTrainer
    print("✅ Imported IPADTrainer successfully")
    print()

    # Create trainer with quick test parameters.
    # Using 1 epoch for smoke test on CPU, will do full training on GPU.
    print("📋 Configuration:")
    print("   Device: S01 (Conveyor Belt)")
    print("   Epochs: 1 (smoke test on CPU)")
    print("   Batch Size: 2 (reduced for CPU)")
    print("   Learning Rate: 1e-4")
    print("   Memory Dimension: 2000")
    print("   ⚠️  Note: This is a CPU smoke test. Full GPU training needs Gradio interface.")
    print()

    trainer = IPADTrainer(
        device_name="S01",
        epochs=1,                   # Just 1 epoch to verify training works
        batch_size=2,               # Reduced for CPU
        lr=1e-4,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,         # Disable wandb for quick test
        hf_repo=None                # Disable HF upload for quick test
    )
    print("✅ Trainer initialized")
    print()

    # Check CUDA availability. Import is deferred so the banner prints even
    # if torch is slow to load.
    import torch
    print("🔍 Checking GPU availability...")
    print(f"   CUDA Available: {torch.cuda.is_available()}")
    print(f"   Device Count: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        print(f"   Device Name: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("   ⚠️  No GPU detected - this will run on CPU (very slow)")
        print("   ⚠️  ZeroGPU allocation only works through Gradio @spaces.GPU decorator")
    print()

    # Start training
    dataset_path = "/app/cache/IPAD_dataset"
    print("🏋️  Starting training...")
    print(f"   Dataset: {dataset_path}")
    # FIX: this literal was broken across a line boundary in the original
    # (a syntax error); reconstructed as a single string.
    print("   This will take ~10-15 minutes on GPU, several hours on CPU")
    print()
    print("=" * 70)
    print()

    # Train, timing wall-clock duration
    start_time = time.time()
    trainer.train(dataset_path)
    end_time = time.time()

    print()
    print("=" * 70)
    print(f"✅ Training completed in {(end_time - start_time) / 60:.1f} minutes!")
    print("=" * 70)

    # Check checkpoints written by the trainer (pattern: S01_*.pth)
    from pathlib import Path
    checkpoint_dir = Path("./checkpoints")
    checkpoints = list(checkpoint_dir.glob("S01_*.pth"))
    if checkpoints:
        print()
        print("💾 Checkpoints saved:")
        for ckpt in sorted(checkpoints):
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f"   - {ckpt.name} ({size_mb:.1f} MB)")
    else:
        print()
        print("⚠️  No checkpoints found - check logs for errors")

except Exception as e:
    # Broad catch is deliberate here: this is a top-level smoke-test script
    # and any failure (missing module, missing dataset, no GPU) should be
    # reported with a traceback rather than crashing silently.
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()
    print()
    print("=" * 70)
    print("💡 Troubleshooting:")
    print("   1. Check GPU availability (might need @spaces.GPU decorator)")
    print("   2. Check dataset path exists")
    print("   3. Check logs for detailed error messages")
    print("=" * 70)

print()
print("=" * 70)
print("🏁 Training trigger script finished")
print("=" * 70)