# Imported from model-hub commit e049981 (author: AJ50):
# "Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing"
"""Main song processing orchestrator."""
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys
from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from encoder import inference as encoder_infer
from synthesizer import inference as synthesizer_infer
from app.vocoder import inference as vocoder_infer
from synthesizer.hparams import hparams as syn_hp
class SongProcessor:
    """Orchestrates the complete song voice conversion process.

    Pipeline: separate vocals from the instrumental (Demucs), prepare
    lyrics (currently a placeholder), synthesize the vocals in the
    reference speaker's voice (encoder -> synthesizer -> vocoder), then
    mix the synthesized vocals back over the instrumental.
    """

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models.
        """
        self.models_dir = Path(models_dir)
        # The Demucs separator is heavy; it is loaded lazily on first use.
        self.separator = None
        # Working sample rate (Hz) used for separation and mixing.
        self.sr = 16000

    def _ensure_separator(self) -> "VocalSeparator":
        """Lazily create and cache the vocal separator.

        Returns:
            The shared VocalSeparator instance (created on first call).
        """
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path,
                           language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """Load the encoder, synthesizer and vocoder voice-cloning models.

        Args:
            models_dir: Directory containing a "default" subfolder with
                encoder.pt, synthesizer.pt and vocoder.pt.
            language: Requested language. NOTE(review): currently unused
                beyond logging — the same "default" model set is loaded
                regardless; confirm whether per-language models exist.

        Returns:
            The loaded Synthesizer instance. The encoder and vocoder are
            loaded into their respective modules' global state.

        Raises:
            RuntimeError: If any of the three model files is missing.
        """
        print(f"[SongProcessor] Loading {language} voice models...")
        enc_path = models_dir / "default" / "encoder.pt"
        syn_path = models_dir / "default" / "synthesizer.pt"
        voc_path = models_dir / "default" / "vocoder.pt"
        # Fail fast with a clear message before loading anything heavy.
        for path in (enc_path, syn_path, voc_path):
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")
        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")
        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")
        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")
        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path,
                                   voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns generic text).

        In production, would use speech-to-text (e.g. Whisper); the
        arguments are accepted now so the signature is stable later.

        Args:
            audio_path: Path to vocal audio (currently unused).
            voice_sample_path: Path to reference voice (currently unused).

        Returns:
            Extracted lyrics text (fixed placeholder string).
        """
        print("[SongProcessor] Extracting lyrics from audio...")
        # Placeholder: return generic phonetically rich text.
        lyrics = "The music is playing so well with this song today"
        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True,
                     models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Complete pipeline:
        1. Separate vocals from instrumental
        2. Extract lyrics from vocals (or use placeholder)
        3. Synthesize vocals using user's voice
        4. Mix synthesized vocals with instrumental
        5. Add audio effects

        Args:
            song_path: Path to input song.
            voice_path: Path to reference voice sample.
            output_path: Path for output song.
            language: 'english' or 'hindi'.
            add_effects: Whether to add reverb/compression.
            models_dir: Directory with models (uses self.models_dir if None).

        Returns:
            Path to output song.

        Raises:
            Exception: Any failure in the pipeline is logged with a
                traceback and re-raised for the caller to handle.
        """
        if models_dir is None:
            models_dir = self.models_dir
        # Normalize to Path in case callers pass plain strings.
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        try:
            print(f"\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")
            # Step 1: Separate vocals from the instrumental track.
            print(f"\n[SongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)
            # Step 2: Extract/prepare lyrics (placeholder for now).
            print(f"\n[SongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)
            # Step 3: Load voice-cloning models.
            print(f"\n[SongProcessor] STEP 3: Loading voice models...")
            synthesizer = self._load_voice_models(models_dir, language)
            # Step 4: Synthesize the lyrics in the reference speaker's voice.
            print(f"\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
            wav = encoder_infer.preprocess_wav(voice_path)
            embed = encoder_infer.embed_utterance(wav)
            mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
            mel = mels[0]
            print("[SongProcessor] Vocoding...")
            try:
                synthesized_vocal = vocoder_infer.infer_waveform(
                    mel, normalize=True, batched=False, target=8000, overlap=800
                ).astype(np.float32)
            except Exception as e:
                # Deliberate best-effort: fall back to Griffin-Lim rather
                # than failing the whole conversion on a vocoder error.
                print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
                synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)
            # Normalize synthesized vocal to ~-3 dBFS (0.707 peak) so the
            # mixer receives a predictable level; guard against silence.
            max_val = np.max(np.abs(synthesized_vocal))
            if max_val > 0:
                target_level = 0.707
                synthesized_vocal = synthesized_vocal * (target_level / max_val)
            synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)
            print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")
            # Step 5: Mix with the instrumental and write the output file.
            print(f"\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )
            # Cleanup: drop module-level model references and free GPU
            # memory; best-effort only, never fails the conversion.
            print(f"\n[SongProcessor] Cleaning up models...")
            try:
                encoder_infer._model = None
                synthesizer_infer._model = None
                vocoder_infer._model = None
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"[SongProcessor] Warning during cleanup: {e}")
            print(f"\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[SongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            # Log with full traceback, then re-raise so the caller (e.g.
            # the web backend) can surface the error to the user.
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise