Spaces:
Sleeping
Sleeping
File size: 7,517 Bytes
e049981 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
"""Main song processing orchestrator."""
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys
from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from encoder import inference as encoder_infer
from synthesizer import inference as synthesizer_infer
from app.vocoder import inference as vocoder_infer
from synthesizer.hparams import hparams as syn_hp
class SongProcessor:
    """Orchestrates the complete song voice conversion process.

    The pipeline separates a song into vocals and instrumental, synthesizes
    a replacement vocal from a reference voice sample, and mixes the result
    back over the instrumental.
    """

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models
        """
        self.models_dir = Path(models_dir)
        # Created on first use by _ensure_separator().
        self.separator = None
        # Sample rate handed to the separator and to the mixer.
        self.sr = 16000

    def _ensure_separator(self) -> "VocalSeparator":
        """Lazy load vocal separator.

        Returns:
            The cached VocalSeparator, created on the first call.
        """
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path,
                           language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """Load voice cloning models.

        The encoder, synthesizer and vocoder checkpoints are all read from
        ``models_dir / "default"``.

        Args:
            models_dir: Directory containing the ``default`` model folder
            language: Only used in the log message; the same checkpoints
                are loaded whatever value is given

        Returns:
            The synthesizer instance. The encoder and vocoder are loaded
            through their modules' ``load_model`` functions.

        Raises:
            RuntimeError: If any of the three checkpoint files is missing
        """
        print(f"[SongProcessor] Loading {language} voice models...")
        # convert_song forwards a caller-supplied value, which may be a str.
        default_dir = Path(models_dir) / "default"
        enc_path = default_dir / "encoder.pt"
        syn_path = default_dir / "synthesizer.pt"
        voc_path = default_dir / "vocoder.pt"

        # Fail before loading anything if a checkpoint is absent.
        for path in (enc_path, syn_path, voc_path):
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")

        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")
        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")
        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")
        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path, voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns generic text).

        In production, would use speech-to-text. Both arguments are
        currently ignored.

        Args:
            audio_path: Path to vocal audio
            voice_sample_path: Path to reference voice

        Returns:
            Extracted lyrics text
        """
        print("[SongProcessor] Extracting lyrics from audio...")
        # Placeholder: return generic phonetically rich text
        # In production, use Whisper or other STT model
        lyrics = "The music is playing so well with this song today"
        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    @staticmethod
    def _release_models() -> None:
        """Reset the inference modules' ``_model`` attributes and free cached memory.

        Best effort: any exception raised while cleaning up is reported as a
        warning and never propagated.
        """
        print("\n[SongProcessor] Cleaning up models...")
        try:
            encoder_infer._model = None
            synthesizer_infer._model = None
            vocoder_infer._model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"[SongProcessor] Warning during cleanup: {e}")

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True,
                     models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Complete pipeline:
        1. Separate vocals from instrumental
        2. Extract lyrics from vocals (or use placeholder)
        3. Synthesize vocals using user's voice
        4. Mix synthesized vocals with instrumental
        5. Add audio effects

        Models are released whether the conversion succeeds or fails.

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression
            models_dir: Directory with models (uses self.models_dir if None)

        Returns:
            Path to output song

        Raises:
            RuntimeError: If a model checkpoint is missing
            Exception: Any other pipeline error is logged with its traceback
                and re-raised unchanged
        """
        if models_dir is None:
            models_dir = self.models_dir
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)

        try:
            print("\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")

            synthesizer = None
            try:
                # Step 1: Separate vocals
                print("\n[SongProcessor] STEP 1: Separating vocals...")
                separator = self._ensure_separator()
                # Only the instrumental is reused; the vocal is re-synthesized.
                _vocals, instrumental = separator.separate(song_path, sr=self.sr)

                # Step 2: Extract/prepare lyrics (using placeholder for now)
                print("\n[SongProcessor] STEP 2: Preparing lyrics...")
                lyrics = self._extract_lyrics_from_audio(song_path, voice_path)

                # Step 3: Load voice models
                print("\n[SongProcessor] STEP 3: Loading voice models...")
                synthesizer = self._load_voice_models(models_dir, language)

                # Step 4: Synthesize voice with your voice
                print("\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
                wav = encoder_infer.preprocess_wav(voice_path)
                embed = encoder_infer.embed_utterance(wav)
                mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
                mel = mels[0]

                print("[SongProcessor] Vocoding...")
                try:
                    synthesized_vocal = vocoder_infer.infer_waveform(
                        mel, normalize=True, batched=False, target=8000, overlap=800
                    ).astype(np.float32)
                except Exception as e:
                    # Deliberate best effort: fall back to Griffin-Lim
                    # rather than failing the whole conversion.
                    print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
                    synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)

                # Normalize synthesized vocal: scale the peak to 0.707 and
                # skip all-zero output to avoid dividing by zero.
                max_val = np.max(np.abs(synthesized_vocal))
                if max_val > 0:
                    target_level = 0.707
                    synthesized_vocal = synthesized_vocal * (target_level / max_val)
                synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)
                print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

                # Step 5: Mix with instrumental
                print("\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
                final_audio = AudioMixer.mix_and_save(
                    synthesized_vocal, instrumental,
                    output_path, sr=self.sr,
                    add_effects=add_effects
                )
            finally:
                # Cleanup runs on failure as well as success. Drop the local
                # synthesizer reference first so gc.collect() can reclaim it.
                synthesizer = None
                self._release_models()

            print("\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[SongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
|