Spaces:
Sleeping
Sleeping
| """Main song processing orchestrator.""" | |
| import gc | |
| import torch | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import Optional | |
| import sys | |
| from app.song_conversion.vocal_separator import VocalSeparator | |
| from app.song_conversion.audio_mixer import AudioMixer | |
| from encoder import inference as encoder_infer | |
| from synthesizer import inference as synthesizer_infer | |
| from app.vocoder import inference as vocoder_infer | |
| from synthesizer.hparams import hparams as syn_hp | |
class SongProcessor:
    """Orchestrates the complete song voice conversion process.

    Pipeline: separate vocals from the instrumental, obtain lyrics,
    synthesize the vocals in the user's voice, then mix the result
    back over the instrumental track.
    """

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models.
        """
        self.models_dir = Path(models_dir)
        # Separator is expensive to construct; created lazily on first use.
        self.separator = None
        # Working sample rate (Hz) used for separation and final mixing.
        self.sr = 16000

    def _ensure_separator(self) -> "VocalSeparator":
        """Lazy load vocal separator (cached after first call)."""
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path,
                           language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """Load voice cloning models (encoder, synthesizer, vocoder).

        Args:
            models_dir: Directory containing the model checkpoints.
            language: Requested language ('english' or 'hindi').
                NOTE(review): currently only logged — the same "default"
                checkpoints are loaded regardless of language; confirm
                whether per-language models are intended.

        Returns:
            The loaded synthesizer instance. (The encoder and vocoder are
            held as module-level state by their inference modules.)

        Raises:
            RuntimeError: If any required model file is missing.
        """
        print(f"[SongProcessor] Loading {language} voice models...")
        enc_path = models_dir / "default" / "encoder.pt"
        syn_path = models_dir / "default" / "synthesizer.pt"
        voc_path = models_dir / "default" / "vocoder.pt"
        # Fail fast with a clear message before any heavy model loading.
        for path in (enc_path, syn_path, voc_path):
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")
        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")
        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")
        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")
        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path,
                                   voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns generic text).

        In production, would use speech-to-text.

        Args:
            audio_path: Path to vocal audio (currently unused by the placeholder).
            voice_sample_path: Path to reference voice (currently unused).

        Returns:
            Extracted lyrics text.
        """
        print("[SongProcessor] Extracting lyrics from audio...")
        # Placeholder: return generic phonetically rich text.
        # In production, use Whisper or other STT model.
        lyrics = "The music is playing so well with this song today"
        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    def _cleanup_models(self) -> None:
        """Best-effort release of loaded models and GPU memory.

        Resets the private module-level model handles of the inference
        modules, then forces a GC pass and empties the CUDA cache when
        a GPU is available. Never raises — failures are logged only.
        """
        print(f"\n[SongProcessor] Cleaning up models...")
        try:
            # NOTE(review): these touch private module attributes of the
            # inference packages — verify the attribute names still exist
            # when upgrading those packages.
            encoder_infer._model = None
            synthesizer_infer._model = None
            vocoder_infer._model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"[SongProcessor] Warning during cleanup: {e}")

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True,
                     models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Complete pipeline:
        1. Separate vocals from instrumental
        2. Extract lyrics from vocals (or use placeholder)
        3. Synthesize vocals using user's voice
        4. Mix synthesized vocals with instrumental
        5. Add audio effects

        Args:
            song_path: Path to input song.
            voice_path: Path to reference voice sample.
            output_path: Path for output song.
            language: 'english' or 'hindi'.
            add_effects: Whether to add reverb/compression.
            models_dir: Directory with models (uses self.models_dir if None).

        Returns:
            Path to output song.

        Raises:
            RuntimeError: If a required model checkpoint is missing.
            Exception: Any error from separation, synthesis or mixing is
                logged with a traceback and re-raised.
        """
        if models_dir is None:
            models_dir = self.models_dir
        # Normalize to Path so callers may pass plain strings.
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        try:
            print(f"\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")

            # Step 1: Separate vocals from the instrumental track.
            print(f"\n[SongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics (placeholder STT for now).
            print(f"\n[SongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)

            # Step 3: Load voice cloning models.
            print(f"\n[SongProcessor] STEP 3: Loading voice models...")
            synthesizer = self._load_voice_models(models_dir, language)

            # Step 4: Synthesize the lyrics in the reference speaker's voice.
            print(f"\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
            wav = encoder_infer.preprocess_wav(voice_path)
            embed = encoder_infer.embed_utterance(wav)
            mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
            mel = mels[0]
            print("[SongProcessor] Vocoding...")
            try:
                synthesized_vocal = vocoder_infer.infer_waveform(
                    mel, normalize=True, batched=False, target=8000, overlap=800
                ).astype(np.float32)
            except Exception as e:
                # Neural vocoder is best-effort; Griffin-Lim is the fallback.
                print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
                synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)

            # Normalize synthesized vocal to ~-3 dBFS peak, then hard-clip
            # as a safety net against float rounding.
            max_val = np.max(np.abs(synthesized_vocal))
            if max_val > 0:
                target_level = 0.707
                synthesized_vocal = synthesized_vocal * (target_level / max_val)
            synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)
            print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Step 5: Mix the new vocal over the instrumental and save.
            print(f"\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

            print(f"\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[SongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
        finally:
            # Fix: the original only released models on success, leaving
            # them (and CUDA memory) resident after a failed conversion.
            self._cleanup_models()