# Demusics / audio_processor.py
# Kremon96's picture
# Create audio_processor.py
# a4c2add verified
# audio_processor.py
import librosa
import numpy as np
import torch
import torchaudio
import soundfile as sf
from pathlib import Path
import warnings
from config import SAMPLE_RATE, SUPPORTED_FORMATS, MAX_FILE_SIZE_MB
warnings.filterwarnings("ignore")
class AudioProcessor:
    """Validate, load, save and inspect audio files.

    Waveforms returned by ``load_audio`` are stereo ``torch`` tensors laid out
    as ``(channels, samples)`` and peak-normalized to ``[-1, 1]``.
    """

    def __init__(self):
        # Default rate used whenever a method is called without an explicit one.
        self.sample_rate = SAMPLE_RATE

    def validate_audio_file(self, file_path) -> bool:
        """Check that ``file_path`` can be handed to the loader.

        Args:
            file_path: Location of the audio file (``str`` or ``Path``).

        Returns:
            ``True`` when every check passes.

        Raises:
            FileNotFoundError: Nothing exists at ``file_path``.
            ValueError: The path is not a regular file, its extension is not
                in ``SUPPORTED_FORMATS``, or its size exceeds
                ``MAX_FILE_SIZE_MB``.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"Audio file not found: {file_path}")
        # exists() is also true for directories; a directory named "x.wav"
        # would otherwise pass every check below.
        if not file_path.is_file():
            raise ValueError(f"Not a regular file: {file_path}")
        if file_path.suffix.lower() not in SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported format: {file_path.suffix}. "
                f"Supported: {SUPPORTED_FORMATS}"
            )
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > MAX_FILE_SIZE_MB:
            raise ValueError(
                f"File too large: {file_size_mb:.1f}MB > {MAX_FILE_SIZE_MB}MB limit"
            )
        return True

    def load_audio(self, file_path, target_sr=None):
        """Load an audio file as a normalized stereo tensor.

        Args:
            file_path: Location of the audio file (``str`` or ``Path``).
            target_sr: Sample rate to resample to. Defaults to
                ``self.sample_rate``.

        Returns:
            ``(waveform, sample_rate)`` where ``waveform`` is a float32
            ``torch.Tensor`` of shape ``(2, samples)`` scaled so that its peak
            magnitude is (just under) 1, and ``sample_rate`` is the rate
            reported by the loader.

        Raises:
            Exception: Validation or decoding failed. The original error is
                attached as ``__cause__``.
        """
        if target_sr is None:
            target_sr = self.sample_rate
        try:
            self.validate_audio_file(file_path)
            # mono=False keeps the channels; librosa then returns (samples,)
            # for mono input and (channels, samples) otherwise.
            waveform, sr = librosa.load(file_path, sr=target_sr, mono=False)
            if waveform.shape[-1] == 0:
                # Fail with a readable message instead of an opaque reduction
                # error from max() on an empty tensor further down.
                raise ValueError("Audio file contains no samples")
            if waveform.ndim == 1:
                # Duplicate the mono channel so downstream always sees stereo.
                waveform = np.stack([waveform, waveform])
            elif waveform.ndim == 2 and waveform.shape[0] > 2:
                # Multi-channel input: keep only the first two channels.
                waveform = waveform[:2, :]
            # No transpose heuristic here: the array is already
            # (channels, samples), and comparing the two dimensions would
            # wrongly flip clips shorter than two samples.
            waveform = torch.as_tensor(waveform, dtype=torch.float32)
            # Peak-normalize; the epsilon avoids division by zero on silence.
            waveform = waveform / (waveform.abs().max() + 1e-8)
            # Return the loader's rate: identical to target_sr when one was
            # requested, and the file's native rate when target_sr is None.
            return waveform, sr
        except Exception as e:
            raise Exception(f"Error loading audio {file_path}: {e}") from e

    def save_audio(self, waveform, file_path, sample_rate=None):
        """Write a waveform to ``file_path`` as 16-bit PCM.

        The signal is peak-normalized and scaled to 0.95 of full scale before
        writing, so the saved loudness does not reflect the input level.

        Args:
            waveform: ``torch.Tensor`` or NumPy array. Two-dimensional input
                whose first dimension is 2 (or 1, with more than two samples)
                is treated as ``(channels, samples)`` and transposed; anything
                else is written as given.
            file_path: Destination path; missing parent directories are
                created.
            sample_rate: Sample rate to record in the file. Defaults to
                ``self.sample_rate``.

        Returns:
            ``True`` on success.

        Raises:
            Exception: Conversion or writing failed. The original error is
                attached as ``__cause__``.
        """
        if sample_rate is None:
            sample_rate = self.sample_rate
        try:
            if isinstance(waveform, torch.Tensor):
                # detach()/cpu() so tensors that require grad or live on an
                # accelerator can be converted to NumPy.
                waveform = waveform.detach().cpu().numpy()
            # soundfile expects (frames, channels). Stereo channels-first input
            # is transposed as before; mono (1, N) input is now handled too,
            # since it would otherwise be written as one frame of N channels.
            if waveform.ndim == 2:
                leading, trailing = waveform.shape
                if leading == 2 or (leading == 1 and trailing > 2):
                    waveform = waveform.T
            # Peak-normalize, then leave 5% headroom below full scale.
            waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
            waveform = np.clip(waveform * 0.95, -1, 1)
            Path(file_path).parent.mkdir(parents=True, exist_ok=True)
            sf.write(file_path, waveform, sample_rate, subtype='PCM_16')
            return True
        except Exception as e:
            raise Exception(f"Error saving audio {file_path}: {e}") from e

    def get_audio_info(self, file_path) -> dict:
        """Return basic metadata for an audio file.

        Args:
            file_path: Location of the audio file (``str`` or ``Path``).

        Returns:
            Dict with keys ``'duration'``, ``'sample_rate'``, ``'channels'``
            and ``'format'``, taken from ``soundfile.info``.

        Raises:
            Exception: Validation or header parsing failed. The original error
                is attached as ``__cause__``.
        """
        try:
            self.validate_audio_file(file_path)
            info = sf.info(file_path)
            return {
                'duration': info.duration,
                'sample_rate': info.samplerate,
                'channels': info.channels,
                'format': info.format,
            }
        except Exception as e:
            raise Exception(f"Error getting audio info: {e}") from e