Spaces:
Sleeping
Sleeping
import numpy as np
import librosa
from scipy import signal, stats
import torch
import torch.nn as nn
# import pywt
def _morlet2(m, s, w=5.0):
    """Complex Morlet wavelet of length `m` samples with width `s` and
    frequency parameter `w`.

    Drop-in equivalent of `scipy.signal.morlet2`, which was deprecated in
    SciPy 1.12 and removed in 1.15.
    """
    x = (np.arange(0, m) - (m - 1.0) / 2) / s
    wavelet = np.exp(1j * w * x) * np.exp(-0.5 * x ** 2) * np.pi ** (-0.25)
    # sqrt(1/s) gives the L2 normalization scipy used.
    return np.sqrt(1 / s) * wavelet


def compute_cwt_power_spectrum(audio, sample_rate, num_freqs=128, f_min=20, f_max=None, w=5.0):
    """
    Compute the power spectrum of the continuous wavelet transform using a
    Morlet wavelet.

    Parameters:
        audio: torch.Tensor
            Input 1-D audio signal
        sample_rate: int
            Sampling rate of the audio
        num_freqs: int
            Number of frequency bins for the CWT
        f_min: float
            Minimum frequency to analyze
        f_max: float or None
            Maximum frequency to analyze (defaults to Nyquist frequency)
        w: float
            Morlet width parameter (default matches the previously
            hard-coded value of 5.0)

    Returns:
        torch.Tensor: CWT power spectrum, shape (num_freqs, len(audio))
    """
    audio_np = audio.cpu().numpy()

    # Default the upper analysis frequency to Nyquist.
    if f_max is None:
        f_max = sample_rate // 2

    # Logarithmically spaced analysis frequencies.
    frequencies = np.logspace(np.log10(f_min), np.log10(f_max), num=num_freqs)

    # Wavelet widths in samples. The Morlet center frequency satisfies
    # f = w * fs / (2 * pi * s)  =>  s = w * fs / (2 * pi * f).
    # BUGFIX: the original omitted the factor `w`, so every band actually
    # analyzed w (= 5) times its labeled frequency.
    widths = w * sample_rate / (2 * np.pi * frequencies)

    # Inline CWT equivalent to the removed scipy.signal.cwt: convolve the
    # signal with the conjugated, time-reversed wavelet at each scale.
    n = len(audio_np)
    cwt = np.empty((len(widths), n), dtype=np.complex128)
    for i, width in enumerate(widths):
        # scipy capped the wavelet support at min(10 * width, len(data)).
        length = int(min(10 * width, n))
        wavelet_data = np.conj(_morlet2(length, width, w=w)[::-1])
        cwt[i] = np.convolve(audio_np, wavelet_data, mode='same')

    # Power spectrum = squared magnitude of the complex coefficients.
    power_spectrum = np.abs(cwt) ** 2
    return torch.FloatTensor(power_spectrum)
| # def compute_wavelet_transform(audio, wavelet, decompos_level): | |
| # """Compute wavelet decomposition of the audio signal.""" | |
| # # Convert to numpy and ensure 1D | |
| # audio_np = audio.cpu().numpy() | |
| # | |
| # # Perform wavelet decomposition | |
| # coeffs = pywt.wavedec(audio_np, wavelet, level=decompos_level) | |
| # | |
| # # Stack coefficients into a 2D array | |
| # # First, pad all coefficient arrays to the same length | |
| # max_len = max(len(c) for c in coeffs) | |
| # padded_coeffs = [] | |
| # for coeff in coeffs: | |
| # pad_len = max_len - len(coeff) | |
| # if pad_len > 0: | |
| # padded_coeff = np.pad(coeff, (0, pad_len), mode='constant') | |
| # else: | |
| # padded_coeff = coeff | |
| # padded_coeffs.append(padded_coeff) | |
| # | |
| # # Stack into 2D array where each row is a different scale | |
| # wavelet_features = np.stack(padded_coeffs) | |
| # | |
| # # Convert to tensor | |
| # return torch.FloatTensor(wavelet_features) | |
def compute_melspectrogram(audio, sample_rate):
    """Return a 128-band log-mel spectrogram (dB scale) as a torch.FloatTensor."""
    samples = audio.cpu().numpy()
    spec = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=128)
    spec_db = librosa.power_to_db(spec)
    return torch.FloatTensor(spec_db)
def compute_mfcc(audio, sample_rate):
    """Return the first 20 MFCCs of the signal as a torch.FloatTensor."""
    samples = audio.cpu().numpy()
    coeffs = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=20)
    return torch.FloatTensor(coeffs)
def compute_chroma(audio, sample_rate):
    """Return an STFT-based chromagram of the signal as a torch.FloatTensor."""
    samples = audio.cpu().numpy()
    chromagram = librosa.feature.chroma_stft(y=samples, sr=sample_rate)
    return torch.FloatTensor(chromagram)
def compute_time_domain_features(audio, sample_rate, frame_length=2048, hop_length=128):
    """
    Extract scalar time-domain descriptors from an audio signal.

    Returns a dict of 1-element torch.Tensors ('tempo' wraps librosa's
    tempo estimate array directly).
    """
    samples = audio.cpu().numpy()

    # Zero crossing rate, summed over all frames.
    zcr = librosa.feature.zero_crossing_rate(
        y=samples, frame_length=frame_length, hop_length=hop_length)

    # Frame-wise RMS energy.
    rms = librosa.feature.rms(
        y=samples, frame_length=frame_length, hop_length=hop_length)

    # Frame the signal; frames are columns, so axis=0 aggregates within a frame.
    frames = librosa.util.frame(
        samples, frame_length=frame_length, hop_length=hop_length)

    # Global tempo estimate from the onset-strength envelope.
    onset_env = librosa.onset.onset_strength(y=samples, sr=sample_rate)
    tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sample_rate)

    # Mean STFT magnitude as a crude amplitude-envelope summary.
    envelope = np.abs(librosa.stft(samples, n_fft=frame_length, hop_length=hop_length))

    return {
        'zcr': torch.Tensor([zcr.sum()]),
        'rms_energy': torch.Tensor([rms.mean()]),
        'mean': torch.Tensor([np.mean(frames, axis=0).mean()]),
        'std': torch.Tensor([np.std(frames, axis=0).mean()]),
        'max': torch.Tensor([np.max(frames, axis=0).mean()]),
        'tempo': torch.Tensor(tempo),
        'envelope': torch.Tensor([np.mean(envelope, axis=0).mean()]),
    }
def compute_frequency_domain_features(audio, sample_rate, n_fft=2048, hop_length=512):
    """
    Compute frequency-domain features from an audio signal.

    Each feature is computed independently inside its own guard, so a single
    failure is recorded as NaN instead of aborting the whole extraction.

    Parameters:
        audio: torch.Tensor
            Input 1-D audio signal
        sample_rate: int
            Sampling rate of the audio
        n_fft: int
            FFT window size
        hop_length: int
            Hop size between STFT frames

    Returns:
        dict[str, torch.FloatTensor]: one-element tensors keyed by feature
        name (NaN when the computation failed)
    """
    audio_np = audio.cpu().numpy()
    features = {}

    def _safe(key, fn):
        # Store fn() as a 1-element tensor; record NaN on any failure.
        try:
            features[key] = torch.FloatTensor([fn()])
        except Exception:
            features[key] = torch.FloatTensor([np.nan])

    def _stft_mag():
        # Magnitude STFT shared by the hand-rolled features below.
        return np.abs(librosa.stft(audio_np, n_fft=n_fft, hop_length=hop_length))

    # 1. Spectral Centroid (maximum over frames)
    _safe('spectral_centroid', lambda: librosa.feature.spectral_centroid(
        y=audio_np, sr=sample_rate, n_fft=n_fft, hop_length=hop_length).max())

    # 2. Spectral Rolloff (maximum over frames)
    _safe('spectral_rolloff', lambda: librosa.feature.spectral_rolloff(
        y=audio_np, sr=sample_rate, n_fft=n_fft, hop_length=hop_length).max())

    # 3. Spectral Bandwidth (maximum over frames)
    _safe('spectral_bandwidth', lambda: librosa.feature.spectral_bandwidth(
        y=audio_np, sr=sample_rate, n_fft=n_fft, hop_length=hop_length).max())

    # 4. Spectral Contrast (mean over bands and frames)
    _safe('spectral_contrast', lambda: librosa.feature.spectral_contrast(
        y=audio_np, sr=sample_rate, n_fft=n_fft, hop_length=hop_length,
        fmin=20,       # lower minimum frequency
        n_bands=4,     # reduced number of bands
        quantile=0.02).mean())

    # 5. Spectral Flatness (maximum over frames)
    _safe('spectral_flatness', lambda: librosa.feature.spectral_flatness(
        y=audio_np, n_fft=n_fft, hop_length=hop_length).max())

    # 6. Spectral Flux: frame-to-frame change of the magnitude spectrum,
    # zero-padded so the first frame contributes a zero column.
    def _flux():
        flux = np.diff(_stft_mag(), axis=1)
        flux = np.pad(flux, ((0, 0), (1, 0)), mode='constant')
        return np.std(flux)
    _safe('spectral_flux', _flux)

    # 7. MFCCs (mean over coefficients and frames)
    _safe('mfcc_mean', lambda: librosa.feature.mfcc(
        y=audio_np, sr=sample_rate, n_mfcc=13, n_fft=n_fft,
        hop_length=hop_length).mean())

    # 8. Chroma (mean over pitch classes and frames)
    _safe('chroma_mean', lambda: librosa.feature.chroma_stft(
        y=audio_np, sr=sample_rate, n_fft=n_fft, hop_length=hop_length).mean())

    # 9. Spectral Kurtosis. BUGFIX: librosa has no `spectral_kurtosis`, so
    # the original call always raised and this feature was permanently NaN.
    # Compute the per-frame kurtosis of the magnitude spectrum instead.
    _safe('spectral_kurtosis',
          lambda: float(np.mean(stats.kurtosis(_stft_mag(), axis=0))))

    # 10. Spectral Skewness. Same situation as kurtosis: computed manually.
    _safe('spectral_skewness',
          lambda: float(np.mean(stats.skew(_stft_mag(), axis=0))))

    # 11. Spectral Slope. Same situation: per-frame least-squares slope of
    # magnitude vs. frequency, averaged over frames.
    def _slope():
        mag = _stft_mag()
        freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
        centered = freqs - freqs.mean()
        slopes = (centered[:, None] * (mag - mag.mean(axis=0))).sum(axis=0)
        slopes /= np.sum(centered ** 2)
        return float(np.mean(slopes))
    _safe('spectral_slope', _slope)

    # 12. Tonnetz (mean of the tonal centroid features)
    _safe('tonnetz_mean', lambda: librosa.feature.tonnetz(
        y=audio_np, sr=sample_rate).mean())

    return features
def compute_all_features(audio, sample_rate, wavelet='db1', decompos_level=4):
    """
    Compute the currently enabled feature set for an audio signal.

    Only the frequency-domain features are active; the other extractors
    (wavelet decomposition, mel spectrogram, MFCC, chroma, CWT power,
    time-domain statistics) are defined above and can be re-enabled here.

    Parameters:
        audio: torch.Tensor
            Input 1-D audio signal
        sample_rate: int
            Sampling rate of the audio
        wavelet: str
            Wavelet name — unused while the wavelet features are disabled;
            kept for interface compatibility
        decompos_level: int
            Wavelet decomposition level — unused, kept for interface
            compatibility

    Returns:
        dict[str, torch.FloatTensor]: frequency-domain feature dictionary
    """
    # Previously disabled extractors (re-enable by merging into the result):
    # features['wavelet'] = compute_wavelet_transform(audio, wavelet, decompos_level)
    # features['melspectrogram'] = compute_melspectrogram(audio, sample_rate)
    # features['mfcc'] = compute_mfcc(audio, sample_rate)
    # features['chroma'] = compute_chroma(audio, sample_rate)
    # features['cwt_power'] = compute_cwt_power_spectrum(
    #     audio, sample_rate, num_freqs=128, f_min=20, f_max=sample_rate // 2)
    # features['time_domain'] = compute_time_domain_features(audio, sample_rate)

    # NOTE: the original built an unused `features = {}` dict and discarded
    # it; it has been removed as dead code.
    return compute_frequency_domain_features(audio, sample_rate)