"""Extract frame-level Wav2Vec 2.0 embeddings for a directory of WAV files.

Each input file produces one ``.npy`` file in EMBEDDING_DIR containing an
array of shape (time_frames, hidden_size).
"""

import glob
import logging
import os
import warnings
from pathlib import Path

import numpy as np
import torch
import torchaudio
from tqdm import tqdm
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# Silence transformers' verbose loading output and generic warnings.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Configuration
AUDIO_DIR = "/home/vikrant/Conversational-AI-Model/embedding_vocoder/non_empty_wavs/*.wav"
EMBEDDING_DIR = "/home/vikrant/Conversational-AI-Model/embedding_vocoder/embeddings"
MODEL_NAME = "facebook/wav2vec2-large-lv60"
SAMPLE_RATE = 16000  # Wav2Vec 2.0 models expect 16 kHz mono input
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def initialize_models():
    """Initialize Wav2Vec 2.0 model and processor with XLA workaround.

    Returns:
        tuple: ``(processor, model)`` with the model moved to DEVICE and
        set to eval mode for inference.
    """
    print("Loading Wav2Vec 2.0 model...")

    # Disable XLA integration (workaround for XLA import issues).
    os.environ["NO_XLA_IMPORT"] = "1"

    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2Model.from_pretrained(
        MODEL_NAME,
        attn_implementation="eager",  # Force eager attention implementation
    ).to(DEVICE)
    model.eval()  # inference only: make dropout/batch-norm behavior explicit

    return processor, model


def process_audio_file(audio_path, processor, model):
    """Process a single audio file and extract embeddings.

    Args:
        audio_path: Path to a WAV file.
        processor: Wav2Vec2Processor used for feature extraction.
        model: Wav2Vec2Model already placed on DEVICE.

    Returns:
        numpy.ndarray of shape (time_frames, hidden_size) with the full
        per-frame hidden states (no pooling), or None if processing failed.
    """
    try:
        # Load audio; waveform is (channels, samples).
        waveform, orig_sr = torchaudio.load(audio_path)

        # Convert to mono if stereo by averaging channels.
        if waveform.dim() > 1 and waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to the model's expected rate if necessary.
        if orig_sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_sr, SAMPLE_RATE)
            waveform = resampler(waveform)

        # Peak-normalize. Guard against all-zero (silent) audio, which would
        # otherwise divide by zero and fill the waveform with NaNs.
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        # Run through Wav2Vec 2.0 without building autograd graphs.
        with torch.no_grad():
            inputs = processor(
                waveform.squeeze().numpy(),
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt",
            ).to(DEVICE)
            outputs = model(**inputs)

            # Keep the full per-frame hidden states: (T, hidden_size).
            embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()

        return embeddings

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None


def generate_embeddings():
    """Process all audio files matched by AUDIO_DIR and save embeddings.

    Saves one ``<stem>.npy`` per input file into EMBEDDING_DIR and prints
    a summary of successes and failures at the end.
    """
    # Create output directory if it does not exist.
    Path(EMBEDDING_DIR).mkdir(parents=True, exist_ok=True)

    # Collect input files (AUDIO_DIR is a glob pattern).
    audio_files = glob.glob(AUDIO_DIR)
    print(f"Found {len(audio_files)} audio files")

    processor, model = initialize_models()

    skipped_files = []
    processed_count = 0

    for audio_path in tqdm(audio_files, desc="Processing audio files"):
        try:
            embeddings = process_audio_file(audio_path, processor, model)
            if embeddings is None:
                # process_audio_file already printed the error.
                skipped_files.append(audio_path)
                continue

            # Save as <EMBEDDING_DIR>/<input stem>.npy
            stem = Path(audio_path).stem
            output_path = os.path.join(EMBEDDING_DIR, f"{stem}.npy")
            np.save(output_path, embeddings)
            processed_count += 1

        except Exception as e:
            # Unexpected failure outside process_audio_file's own handler.
            skipped_files.append((audio_path, str(e)))

    # Print summary
    print(f"\nSuccessfully processed {processed_count}/{len(audio_files)} files")
    if skipped_files:
        print(f"\nFailed to process {len(skipped_files)} files:")
        for item in skipped_files[:5]:  # Show first 5 errors
            if isinstance(item, tuple):
                print(f"- {item[0]}: {item[1]}")
            else:
                print(f"- {item}")


if __name__ == "__main__":
    # Report which device will be used.
    if torch.cuda.is_available():
        print("CUDA is available. Using GPU.")
    else:
        print("Using CPU.")

    # Bail out early if the glob matches nothing.
    if not glob.glob(AUDIO_DIR):
        print(f"No audio files found at {AUDIO_DIR}")
        raise SystemExit(1)

    generate_embeddings()