# SolfegeScore-Singer-01 / backend/audio_mixer.py
# (uploaded by JeffreyZhou798, commit ecadc11)
"""
Audio Mixer Module
Mixes multiple voice tracks into a single output
"""
import numpy as np
from typing import List, Optional
def mix_voices(
    voice_audios: List[np.ndarray],
    method: str = "sum",
    normalize: bool = True
) -> np.ndarray:
    """
    Mix multiple voice audio tracks into a single track.

    Shorter tracks are zero-padded to the length of the longest track
    before mixing. Padding now preserves each input's dtype (previously
    the float64 default of ``np.zeros`` silently upcast float32 audio).

    Args:
        voice_audios: List of 1-D audio arrays (one per voice).
        method: Mixing method: "sum", "average", or "weighted"
            (inverse-energy weights, so quieter voices count more).
            Any unrecognized value falls back to "sum".
        normalize: Whether to normalize the mixed output.

    Returns:
        Mixed audio array. Returns 1 second of silence (44100 zeros)
        when voice_audios is empty.
    """
    if not voice_audios:
        return np.zeros(44100)  # 1 second of silence at 44.1 kHz

    # Single voice: nothing to mix, just (optionally) normalize.
    if len(voice_audios) == 1:
        audio = voice_audios[0]
        return normalize_audio(audio) if normalize else audio

    # Zero-pad every track to the longest length, keeping each dtype.
    max_length = max(len(audio) for audio in voice_audios)
    padded_audios = []
    for audio in voice_audios:
        if len(audio) < max_length:
            padding = np.zeros(max_length - len(audio), dtype=audio.dtype)
            audio = np.concatenate([audio, padding])
        padded_audios.append(audio)

    if method == "average":
        mixed = np.mean(padded_audios, axis=0)
    elif method == "weighted":
        # Weight each voice by the inverse of its energy so quieter
        # voices get a higher weight; weights are normalized to sum to 1.
        # The 1e-10 guards against division by zero for silent tracks.
        energies = [np.sum(audio ** 2) for audio in padded_audios]
        weights = [1.0 / (e + 1e-10) for e in energies]
        total_weight = sum(weights)
        weights = [w / total_weight for w in weights]
        mixed = np.zeros(max_length)
        for audio, weight in zip(padded_audios, weights):
            mixed += audio * weight
    else:
        # "sum" and any unrecognized method: plain sample-wise sum.
        mixed = np.sum(padded_audios, axis=0)

    if normalize:
        mixed = normalize_audio(mixed)
    return mixed
def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Scale audio so its RMS level matches a target dB value.

    The dB target is converted to a linear amplitude and scaled by a
    fixed 0.1 factor before the gain is computed; the scaled signal is
    clipped to [-1.0, 1.0] to prevent overflow.

    Args:
        audio: Audio array.
        target_db: Target dB level (default -3.0 dB).

    Returns:
        Normalized audio; returned unchanged when the input is
        effectively silent (avoids division by zero).
    """
    current_rms = float(np.sqrt(np.mean(audio ** 2)))
    # Near-silent input: no meaningful gain can be computed.
    if current_rms < 1e-10:
        return audio
    # NOTE(review): the 0.1 factor puts the effective target 20 dB below
    # the nominal target_db — presumably deliberate headroom; confirm.
    desired_rms = 0.1 * 10 ** (target_db / 20)
    scaled = audio * (desired_rms / current_rms)
    return np.clip(scaled, -1.0, 1.0)
def apply_fade(audio: np.ndarray, fade_in: float = 0.01, fade_out: float = 0.01, sample_rate: int = 44100) -> np.ndarray:
    """
    Apply linear fade-in/fade-out ramps to audio.

    Fixes an off-by-one in the bounds check: a fade whose sample count
    exactly equals the audio length was silently skipped (strict ``<``);
    it is now applied over the whole signal. Fades longer than the audio
    are still skipped, as before.

    Args:
        audio: Audio array (float dtype assumed; the in-place ``*=`` with
            a float ramp would fail on integer audio — TODO confirm
            callers always pass float).
        fade_in: Fade-in duration in seconds.
        fade_out: Fade-out duration in seconds.
        sample_rate: Sample rate in Hz, used to convert seconds to samples.

    Returns:
        A copy of the audio with fades applied (input is not modified).
    """
    audio = audio.copy()

    # Linear ramp 0 -> 1 over the first fade_in seconds.
    fade_in_samples = int(fade_in * sample_rate)
    if 0 < fade_in_samples <= len(audio):
        audio[:fade_in_samples] *= np.linspace(0, 1, fade_in_samples)

    # Linear ramp 1 -> 0 over the last fade_out seconds.
    fade_out_samples = int(fade_out * sample_rate)
    if 0 < fade_out_samples <= len(audio):
        audio[-fade_out_samples:] *= np.linspace(1, 0, fade_out_samples)

    return audio