File size: 7,564 Bytes
5554ef1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python3
"""
Whisper Fine-tuning Setup
Purpose: Fine-tune Whisper-small on German data
GPU: RTX 5060 Ti optimized
"""

import torch
import sys
from pathlib import Path

def check_environment():
    """Verify all dependencies are installed"""
    banner = "=" * 60
    print(banner)
    print("ENVIRONMENT CHECK")
    print(banner)
    
    # Core framework info first, then the GPU details when CUDA is present.
    cuda_ok = torch.cuda.is_available()
    print(f"βœ“ PyTorch: {torch.__version__}")
    print(f"βœ“ CUDA available: {cuda_ok}")
    
    if cuda_ok:
        props = torch.cuda.get_device_properties(0)
        print(f"βœ“ GPU: {torch.cuda.get_device_name(0)}")
        print(f"βœ“ CUDA Capability: {torch.cuda.get_device_capability(0)}")
        print(f"βœ“ VRAM: {props.total_memory / 1e9:.1f} GB")
    
    # Probe each optional dependency; report and bail out on the first miss.
    try:
        from transformers import AutoModel  # noqa: F401 — import check only
    except ImportError:
        print("βœ— Transformers: NOT INSTALLED")
        return False
    print("βœ“ Transformers: Installed")
    
    try:
        from datasets import load_dataset  # noqa: F401 — import check only
    except ImportError:
        print("βœ— Datasets: NOT INSTALLED")
        return False
    print("βœ“ Datasets: Installed")
    
    try:
        import librosa  # noqa: F401 — import check only
    except ImportError:
        print("βœ— Librosa: NOT INSTALLED")
        return False
    print("βœ“ Librosa: Installed")
    
    print("\nβœ… All checks passed! Ready to start.\n")
    return True

def download_data():
    """Interactively select, download, and cache the PolyAI/minds14 German subset.

    Prompts the user for a dataset size, loads the matching split — from a
    local cache under ./data when one exists, otherwise from the Hugging
    Face Hub — caches fresh downloads to disk, and returns the dataset.

    Returns:
        The loaded `datasets.Dataset` instance.

    Raises:
        Exception: re-raised from `load_dataset` when the download fails,
            after printing troubleshooting guidance.
    """
    import os

    print("\n" + "=" * 60)
    print("DATASET CONFIGURATION")
    print("=" * 60)
    
    # Dataset size options with estimated training times on RTX 5060 Ti.
    # NOTE(review): the per-split sample counts below are rough estimates
    # for the de-DE subset — confirm against the dataset card.
    DATASET_OPTIONS = {
        'tiny': {
            'split': "train[:5%]",  # ~30 samples
            'estimated_time': "2-5 minutes",
            'vram': "8-10 GB"
        },
        'small': {
            'split': "train[:20%]",  # ~120 samples
            'estimated_time': "10-15 minutes",
            'vram': "10-12 GB"
        },
        'medium': {
            'split': "train[:50%]",  # ~300 samples
            'estimated_time': "30-45 minutes",
            'vram': "12-14 GB"
        },
        'large': {
            'split': "train",  # Full dataset (600+ samples)
            'estimated_time': "1-2 hours",
            'vram': "14-16 GB"
        }
    }
    
    print("\nAvailable dataset sizes:")
    for size, info in DATASET_OPTIONS.items():
        print(f"- {size}: {info['split']} (est. {info['estimated_time']}, {info['vram']} VRAM)")
    
    # Empty input falls through to the default via `or`.
    user_choice = input("\nSelect dataset size [tiny/small/medium/large] (default: small): ").lower() or 'small'
    
    if user_choice not in DATASET_OPTIONS:
        print(f"Invalid choice '{user_choice}'. Defaulting to 'small'.")
        user_choice = 'small'
        
    dataset_config = DATASET_OPTIONS[user_choice]
    print(f"\nUsing {user_choice} dataset ({dataset_config['split']})")
    print(f"Estimated training time: {dataset_config['estimated_time']}")
    print(f"Estimated VRAM usage: {dataset_config['vram']}")
    
    # Per-size cache directory so different splits don't clobber each other.
    dataset_path = f"./data/minds14_{user_choice}"
    os.makedirs("./data", exist_ok=True)
    
    # Prefer the local cache; fall back to a fresh download if it is corrupt.
    if os.path.exists(dataset_path):
        print("\nFound existing dataset, loading from local storage...")
        try:
            from datasets import load_from_disk
            dataset = load_from_disk(dataset_path)
            print(f"\nβœ“ Loaded dataset from {dataset_path}")
            print(f"  Number of samples: {len(dataset)}")
            return dataset
        except Exception as e:
            print(f"\n⚠️  Could not load from local storage: {e}")
            print("Attempting to download again...")
    
    try:
        from datasets import load_dataset
        print("\nLoading PolyAI/minds14 dataset...")
        
        dataset = load_dataset(
            "PolyAI/minds14",
            "de-DE",  # German subset
            split=dataset_config['split']  # Use selected split
        )
        
        print("\nβœ“ Successfully loaded test dataset")
        print(f"  Number of samples: {len(dataset)}")
        print(f"  Features: {dataset.features}")
        
        # Cache to disk so subsequent runs skip the network entirely.
        dataset.save_to_disk(dataset_path)
        print(f"\nβœ“ Dataset saved to {dataset_path}")
        
        return dataset
        
    except Exception as e:
        # Surface troubleshooting guidance, then propagate so the caller
        # decides whether the failure is fatal.
        print("\n❌ Failed to load test dataset. Here are some options:")
        print("\n1. CHECK YOUR INTERNET CONNECTION")
        print("   - Make sure you have a stable internet connection")
        print("   - Try using a VPN if you're in a restricted region")
        print("\n2. TRY MANUAL DOWNLOAD")
        print("   - Visit: https://huggingface.co/datasets/PolyAI/minds14")
        print("   - Follow the instructions to download the dataset")
        print("   - Place the downloaded files in the './data' directory")
        print("\n3. TRY A DIFFERENT DATASET")
        print("   - Let me know if you'd like to try a different dataset")
        print("\nError details:", str(e))
        raise

def optimize_settings():
    """Configure PyTorch for RTX 5060 Ti"""
    header = "=" * 60
    print(header)
    print("OPTIMIZING FOR RTX 5060 Ti")
    print(header)
    
    # Apply the performance knobs, then echo each one back to the user.
    torch.set_float32_matmul_precision('high')
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    
    applied = (
        "βœ“ torch.set_float32_matmul_precision('high')",
        "βœ“ torch.backends.cuda.matmul.allow_tf32 = True",
        "βœ“ torch.backends.cudnn.benchmark = True",
    )
    for line in applied:
        print(line)
    print("\nThese settings will:")
    print("  β€’ Use Tensor Float 32 (TF32) for faster matrix operations")
    print("  β€’ Enable cuDNN auto-tuning for optimal kernel selection")
    print("  β€’ Expected speedup: 10-20%")
    
    return True

def main():
    """Run the full setup: environment check, GPU tuning, dataset download.

    Returns:
        bool: True when every step succeeded, False otherwise.
    """
    print("\n" + "=" * 60)
    print("WHISPER FINE-TUNING SETUP")
    print("Project: Multilingual ASR for German")
    print("GPU: RTX 5060 Ti (16GB VRAM)")
    print("=" * 60 + "\n")
    
    # Abort early if required packages are missing.
    if not check_environment():
        print("❌ Environment check failed. Please install missing packages.")
        return False
    
    # GPU/TF32/cuDNN tuning (always succeeds; returns True).
    optimize_settings()
    
    try:
        dataset = download_data()
        # Locate the cache directory download_data() just wrote. Picking the
        # most recently modified candidate (rather than the first match in a
        # fixed order) avoids reporting a stale directory left over from an
        # earlier run with a different size selection.
        import os
        dataset_path = "./data/minds14_small"  # Default
        candidates = [
            f"./data/minds14_{size}"
            for size in ('tiny', 'small', 'medium', 'large')
        ]
        existing = [p for p in candidates if os.path.exists(p)]
        if existing:
            dataset_path = max(existing, key=os.path.getmtime)
    except Exception as e:
        print(f"⚠️  Data download failed: {e}")
        print("You can retry later with: python project1_whisper_setup.py")
        return False
    
    print("\n" + "=" * 60)
    print("βœ… SETUP COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(f"1. Review the dataset in {dataset_path}/")
    print("2. Run: python project1_whisper_train.py")
    print("3. Fine-tuning will begin (expect 2-3 days on RTX 5060 Ti)")
    print("=" * 60 + "\n")
    
    return True

if __name__ == "__main__":
    # Exit code 0 on success, 1 on any failure, so shell scripts can chain on it.
    success = main()
    sys.exit(0 if success else 1)