# Hugging Face Spaces status captured with this file (scraping artifact):
# Spaces: Sleeping
# Sleeping
#!/usr/bin/env python3
"""
Trigger GPU training through Gradio interface
Uses gradio_client to call the training endpoint
"""
# NOTE(review): this source was recovered from a pipe-wrapped paste with
# flattened indentation; structure reconstructed from the code's semantics.
# Emoji in the printed messages were mojibake and have been replaced with
# plausible originals — confirm against the upstream file if exact glyphs matter.
import time
from datetime import datetime

print("=" * 70)
print("🚀 IPAD VAD Training Trigger")
print("=" * 70)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# Method 1: Direct function call (since we're in the same process)
print("[Method 1] Direct function call (fastest)")
print("-" * 70)

try:
    # Import the training function directly (project-local module)
    from train_hf import IPADTrainer
    print("✅ Imported IPADTrainer successfully")
    print()

    # Create trainer with quick test parameters.
    # Using 1 epoch for a smoke test on CPU; full training happens on GPU.
    print("📋 Configuration:")
    print("   Device: S01 (Conveyor Belt)")
    print("   Epochs: 1 (smoke test on CPU)")
    print("   Batch Size: 2 (reduced for CPU)")
    print("   Learning Rate: 1e-4")
    print("   Memory Dimension: 2000")
    print("   ⚠️ Note: This is a CPU smoke test. Full GPU training needs Gradio interface.")
    print()

    trainer = IPADTrainer(
        device_name="S01",
        epochs=1,              # Just 1 epoch to verify training works
        batch_size=2,          # Reduced for CPU
        lr=1e-4,
        mem_dim=2000,
        checkpoint_dir="./checkpoints",
        wandb_project=None,    # Disable wandb for quick test
        hf_repo=None,          # Disable HF upload for quick test
    )
    print("✅ Trainer initialized")
    print()

    # Check CUDA availability (informational only — training runs either way)
    import torch
    print("🔍 Checking GPU availability...")
    print(f"   CUDA Available: {torch.cuda.is_available()}")
    print(f"   Device Count: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        print(f"   Device Name: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("   ⚠️ No GPU detected - this will run on CPU (very slow)")
        print("   ⚠️ ZeroGPU allocation only works through Gradio @spaces.GPU decorator")
    print()

    # Start training
    dataset_path = "/app/cache/IPAD_dataset"
    print("🏋️ Starting training...")
    print(f"   Dataset: {dataset_path}")
    print("   This will take ~10-15 minutes on GPU, several hours on CPU")
    print()
    print("=" * 70)
    print()

    # Train, timing the run for the summary line below
    start_time = time.time()
    trainer.train(dataset_path)
    end_time = time.time()

    print()
    print("=" * 70)
    print(f"✅ Training completed in {(end_time - start_time) / 60:.1f} minutes!")
    print("=" * 70)

    # Report any checkpoints the trainer wrote for device S01
    from pathlib import Path
    checkpoint_dir = Path("./checkpoints")
    checkpoints = list(checkpoint_dir.glob("S01_*.pth"))
    if checkpoints:
        print()
        print("💾 Checkpoints saved:")
        for ckpt in sorted(checkpoints):
            size_mb = ckpt.stat().st_size / (1024 * 1024)
            print(f"   - {ckpt.name} ({size_mb:.1f} MB)")
    else:
        print()
        print("⚠️ No checkpoints found - check logs for errors")

except Exception as e:
    # Top-level boundary: report the failure with a full traceback and
    # actionable hints instead of letting the script die silently.
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()
    print()
    print("=" * 70)
    print("💡 Troubleshooting:")
    print("   1. Check GPU availability (might need @spaces.GPU decorator)")
    print("   2. Check dataset path exists")
    print("   3. Check logs for detailed error messages")
    print("=" * 70)

print()
print("=" * 70)
print("🏁 Training trigger script finished")
print("=" * 70)