# Imported from model-hub commit e049981 (author: AJ50):
# "Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing"
"""Main song processing orchestrator."""
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys
from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from encoder import inference as encoder_infer
from synthesizer import inference as synthesizer_infer
from app.vocoder import inference as vocoder_infer
from synthesizer.hparams import hparams as syn_hp
class SongProcessor:
    """Orchestrates the complete song voice conversion process.

    Pipeline: separate vocals from the instrumental (Demucs), prepare
    lyrics (currently a placeholder), synthesize the vocals in the
    reference speaker's voice (encoder -> synthesizer -> vocoder), then
    mix the synthesized vocals back over the instrumental.
    """

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models.
        """
        self.models_dir = Path(models_dir)
        # The Demucs separator is heavy; it is loaded lazily on first use.
        self.separator = None
        # Working sample rate (Hz) used for separation and mixing.
        self.sr = 16000

    def _ensure_separator(self) -> "VocalSeparator":
        """Lazily create and cache the vocal separator.

        Returns:
            The shared VocalSeparator instance (created on first call).
        """
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path,
                           language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """Load the encoder, synthesizer and vocoder voice-cloning models.

        Args:
            models_dir: Directory containing a "default" subfolder with
                encoder.pt, synthesizer.pt and vocoder.pt.
            language: Requested language. NOTE(review): currently unused
                beyond logging — the same "default" model set is loaded
                regardless; confirm whether per-language models exist.

        Returns:
            The loaded Synthesizer instance. The encoder and vocoder are
            loaded into their respective modules' global state.

        Raises:
            RuntimeError: If any of the three model files is missing.
        """
        print(f"[SongProcessor] Loading {language} voice models...")
        enc_path = models_dir / "default" / "encoder.pt"
        syn_path = models_dir / "default" / "synthesizer.pt"
        voc_path = models_dir / "default" / "vocoder.pt"
        # Fail fast with a clear message before loading anything heavy.
        for path in (enc_path, syn_path, voc_path):
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")
        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")
        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")
        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")
        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path,
                                   voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns generic text).

        In production, would use speech-to-text (e.g. Whisper); the
        arguments are accepted now so the signature is stable later.

        Args:
            audio_path: Path to vocal audio (currently unused).
            voice_sample_path: Path to reference voice (currently unused).

        Returns:
            Extracted lyrics text (fixed placeholder string).
        """
        print("[SongProcessor] Extracting lyrics from audio...")
        # Placeholder: return generic phonetically rich text.
        lyrics = "The music is playing so well with this song today"
        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True,
                     models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Complete pipeline:
        1. Separate vocals from instrumental
        2. Extract lyrics from vocals (or use placeholder)
        3. Synthesize vocals using user's voice
        4. Mix synthesized vocals with instrumental
        5. Add audio effects

        Args:
            song_path: Path to input song.
            voice_path: Path to reference voice sample.
            output_path: Path for output song.
            language: 'english' or 'hindi'.
            add_effects: Whether to add reverb/compression.
            models_dir: Directory with models (uses self.models_dir if None).

        Returns:
            Path to output song.

        Raises:
            Exception: Any failure in the pipeline is logged with a
                traceback and re-raised for the caller to handle.
        """
        if models_dir is None:
            models_dir = self.models_dir
        # Normalize to Path in case callers pass plain strings.
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        try:
            print(f"\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")
            # Step 1: Separate vocals from the instrumental track.
            print(f"\n[SongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)
            # Step 2: Extract/prepare lyrics (placeholder for now).
            print(f"\n[SongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)
            # Step 3: Load voice-cloning models.
            print(f"\n[SongProcessor] STEP 3: Loading voice models...")
            synthesizer = self._load_voice_models(models_dir, language)
            # Step 4: Synthesize the lyrics in the reference speaker's voice.
            print(f"\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
            wav = encoder_infer.preprocess_wav(voice_path)
            embed = encoder_infer.embed_utterance(wav)
            mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
            mel = mels[0]
            print("[SongProcessor] Vocoding...")
            try:
                synthesized_vocal = vocoder_infer.infer_waveform(
                    mel, normalize=True, batched=False, target=8000, overlap=800
                ).astype(np.float32)
            except Exception as e:
                # Deliberate best-effort: fall back to Griffin-Lim rather
                # than failing the whole conversion on a vocoder error.
                print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
                synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)
            # Normalize synthesized vocal to ~-3 dBFS (0.707 peak) so the
            # mixer receives a predictable level; guard against silence.
            max_val = np.max(np.abs(synthesized_vocal))
            if max_val > 0:
                target_level = 0.707
                synthesized_vocal = synthesized_vocal * (target_level / max_val)
            synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)
            print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")
            # Step 5: Mix with the instrumental and write the output file.
            print(f"\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )
            # Cleanup: drop module-level model references and free GPU
            # memory; best-effort only, never fails the conversion.
            print(f"\n[SongProcessor] Cleaning up models...")
            try:
                encoder_infer._model = None
                synthesizer_infer._model = None
                vocoder_infer._model = None
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"[SongProcessor] Warning during cleanup: {e}")
            print(f"\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[SongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            # Log with full traceback, then re-raise so the caller (e.g.
            # the web backend) can surface the error to the user.
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise