# aadhi97x's picture
# clean commit
# 6c1314b
import base64
import io
import librosa
import numpy as np
import torch
import torchaudio
import soundfile as sf
def decode_audio(base64_string: str) -> io.BytesIO:
    """
    Decode a base64 string into an in-memory, file-like audio buffer.

    Args:
        base64_string: Base64-encoded audio payload.

    Returns:
        io.BytesIO: Buffer positioned at offset 0 holding the decoded bytes.

    Raises:
        ValueError: If the input cannot be decoded as base64 (malformed
            data, non-ASCII text, or an unsupported argument type). The
            underlying error is attached as ``__cause__``.
    """
    try:
        audio_data = base64.b64decode(base64_string)
    except (ValueError, TypeError) as e:
        # binascii.Error is a ValueError subclass, so malformed payloads are
        # covered here; TypeError covers non-str/bytes input. Chaining keeps
        # the original decoding failure visible in tracebacks.
        raise ValueError(f"Invalid Base64 audio data: {str(e)}") from e
    return io.BytesIO(audio_data)
def load_audio(file_obj, target_sr=16000):
    """
    Read audio from a file object and return it as a batched tensor.

    The samples are loaded through ``librosa.load`` with ``sr=target_sr``,
    then passed through a best-effort noise-reduction step; if that step
    fails for any reason a warning is printed and the unprocessed samples
    are used instead.

    Args:
        file_obj: Audio source handed directly to ``librosa.load``.
        target_sr: Sample rate forwarded to ``librosa.load`` (default 16000).

    Returns:
        waveform (torch.Tensor): Samples with a leading dimension of size 1.
        sr (int): Sample rate reported by ``librosa.load``.
    """
    # librosa does the decoding so compressed formats (MP3, etc.) are handled.
    samples, sample_rate = librosa.load(file_obj, sr=target_sr)

    # Optional clean-up pass (spectral gating) intended to cut false
    # positives caused by background noise. The import lives inside the try
    # so a missing package degrades to a warning rather than an error.
    try:
        import noisereduce

        # stationary=True: the noise profile is estimated from the whole clip.
        denoise_options = {
            "y": samples,
            "sr": sample_rate,
            "stationary": True,
            "prop_decrease": 0.75,
        }
        samples = noisereduce.reduce_noise(**denoise_options)
    except Exception as err:
        print(f"Warning: Noise reduction failed: {err}")

    # Prepend a dimension of size 1 -> (1, time) for mono input.
    batched = torch.tensor(samples).unsqueeze(0)
    return batched, sample_rate
def extract_heuristic_features(y, sr):
    """
    Summarise a signal with a few averaged spectral descriptors.

    Each descriptor is computed by librosa and then reduced to a single
    number with ``np.mean``; the results are meant for explainability.

    Args:
        y: Audio samples passed to the librosa feature functions.
        sr: Sample rate passed to the centroid and rolloff computations.

    Returns:
        dict: Python floats under the keys ``"spectral_centroid"``,
        ``"spectral_rolloff"`` and ``"zero_crossing_rate"``.
    """
    # Raw librosa outputs, keyed by the name used in the returned dict.
    per_frame = {
        "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr),
        "spectral_rolloff": librosa.feature.spectral_rolloff(y=y, sr=sr),
        "zero_crossing_rate": librosa.feature.zero_crossing_rate(y),
    }

    # Collapse every feature to its mean and cast to a plain float.
    return {
        feature_name: float(np.mean(values))
        for feature_name, values in per_frame.items()
    }