# piranware_v2 / src/preprocess.py
# Author: ashandilgith — first commit (cafae0b)
import librosa
import numpy as np
# Audio Settings
SAMPLE_RATE = 22050  # Hz; audio is resampled to this rate on load (librosa's default)
DURATION = 1.0 # 1-second slices
SAMPLES_PER_SLICE = int(SAMPLE_RATE * DURATION)  # samples per slice (22050 at defaults)
N_MELS = 128  # number of mel frequency bins = spectrogram height
def audio_to_spectrograms(file_path, sample_rate=SAMPLE_RATE, duration=DURATION, n_mels=N_MELS):
    """Convert an audio file into a batch of normalized mel-spectrogram "images".

    The audio is resampled to ``sample_rate``, split into consecutive
    non-overlapping slices of ``duration`` seconds (any trailing partial
    slice is dropped), and each slice is rendered as a log-mel spectrogram
    scaled to [0, 1] with a trailing channel axis for CNN input.

    Args:
        file_path: Path to an audio file readable by librosa.
        sample_rate: Target sampling rate in Hz (default: SAMPLE_RATE).
        duration: Slice length in seconds (default: DURATION).
        n_mels: Number of mel frequency bins (default: N_MELS).

    Returns:
        np.ndarray of shape (num_slices, n_mels, time_frames, 1) with
        float values in [0, 1] — time_frames is 44 at the defaults
        (22050 samples, hop_length 512) — or None if the file could not
        be decoded or is shorter than one full slice.
    """
    samples_per_slice = int(sample_rate * duration)

    # Keep the try minimal: decoding is the only genuinely fallible I/O step.
    # Wrapping the whole function (as before) would silently mask programming
    # errors in the slicing/normalization math as "Error processing audio".
    try:
        y, sr = librosa.load(file_path, sr=sample_rate)
    except Exception as e:
        # Best-effort loader: corrupt/unreadable files are reported, not fatal.
        print(f"Error processing audio: {e}")
        return None

    # Only full slices are kept; a trailing partial second is discarded.
    num_slices = len(y) // samples_per_slice
    if num_slices < 1:
        return None  # Audio too short

    spectrograms = []
    for i in range(num_slices):
        start = i * samples_per_slice
        y_slice = y[start:start + samples_per_slice]
        spec = librosa.feature.melspectrogram(y=y_slice, sr=sr, n_mels=n_mels)
        # ref=np.max makes 0 dB the per-slice peak; power_to_db's default
        # top_db=80 clips the floor at -80 dB, so (x + 80) / 80 maps the
        # result onto [0, 1]. The clip guards the exact boundary values.
        log_spec = librosa.power_to_db(spec, ref=np.max)
        norm_spec = np.clip((log_spec + 80) / 80, 0, 1)
        # Add channel axis: (n_mels, frames) -> (n_mels, frames, 1).
        spectrograms.append(norm_spec[..., np.newaxis])

    return np.array(spectrograms)