File size: 7,517 Bytes
e049981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""Main song processing orchestrator."""

import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys

from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from encoder import inference as encoder_infer
from synthesizer import inference as synthesizer_infer
from app.vocoder import inference as vocoder_infer
from synthesizer.hparams import hparams as syn_hp


class SongProcessor:
    """Orchestrates the complete song voice conversion process.

    The pipeline separates vocals from the instrumental, synthesizes a new
    vocal track from a reference voice sample, and mixes the synthesized
    vocal back with the instrumental.
    """

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models
        """
        self.models_dir = Path(models_dir)
        # Created on first use by _ensure_separator().
        self.separator = None
        # Sample rate handed to both the separator and the mixer.
        self.sr = 16000

    def _ensure_separator(self) -> "VocalSeparator":
        """Lazy load vocal separator and return the cached instance."""
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path,
                           language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """
        Load the encoder, synthesizer and vocoder models.

        Args:
            models_dir: Directory with models; a str is accepted and coerced
                to Path. Files are read from its "default" subdirectory.
            language: Only used in the log message here; the same "default"
                model files are loaded regardless of its value.

        Returns:
            The constructed Synthesizer instance. The encoder and vocoder are
            loaded through their modules' load_model() functions.

        Raises:
            RuntimeError: If any of the three model files does not exist.
        """
        print(f"[SongProcessor] Loading {language} voice models...")

        # Callers may hand in a plain string; `/` below needs a Path.
        default_dir = Path(models_dir) / "default"
        enc_path = default_dir / "encoder.pt"
        syn_path = default_dir / "synthesizer.pt"
        voc_path = default_dir / "vocoder.pt"

        # Check every file up front so nothing is loaded when one is missing.
        for path in (enc_path, syn_path, voc_path):
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")

        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")

        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")

        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")

        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path, voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns generic text).
        In production, would use speech-to-text.

        Both arguments are currently ignored; the same fixed sentence is
        returned for every input.

        Args:
            audio_path: Path to vocal audio
            voice_sample_path: Path to reference voice

        Returns:
            Extracted lyrics text
        """
        print("[SongProcessor] Extracting lyrics from audio...")

        # Placeholder: return generic phonetically rich text
        # In production, use Whisper or other STT model
        lyrics = "The music is playing so well with this song today"

        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    def _release_models(self) -> None:
        """Best-effort release of loaded voice models and cached GPU memory."""
        print("\n[SongProcessor] Cleaning up models...")
        try:
            encoder_infer._model = None
            # NOTE(review): whether synthesizer_infer reads a module-level
            # `_model` is not visible from this file; kept as in the original.
            synthesizer_infer._model = None
            vocoder_infer._model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Cleanup must never mask the conversion result or its error.
            print(f"[SongProcessor] Warning during cleanup: {e}")

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                    language: str = 'english', add_effects: bool = True,
                    models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Complete pipeline:
        1. Separate vocals from instrumental
        2. Prepare lyrics (currently a fixed placeholder sentence)
        3. Load the voice models
        4. Synthesize vocals using user's voice
        5. Mix synthesized vocals with instrumental (effects are applied by
           the mixer when add_effects is True)

        Models are released afterwards whether the conversion succeeded or
        failed.

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression
            models_dir: Directory with models (uses self.models_dir if None)

        Returns:
            Path to output song (the value returned by AudioMixer.mix_and_save)

        Raises:
            RuntimeError: If a required model file is missing.
            Exception: Any other pipeline error is logged with its traceback
                and re-raised unchanged.
        """
        models_dir = self.models_dir if models_dir is None else Path(models_dir)

        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)

        # Bound before the try so the finally clause can always clear it.
        synthesizer = None

        try:
            print("\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")

            # Step 1: Separate vocals
            print("\n[SongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            # Only the instrumental is used below; the separated vocals are
            # replaced by the synthesized track.
            vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics (using placeholder for now)
            print("\n[SongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)

            # Step 3: Load voice models
            print("\n[SongProcessor] STEP 3: Loading voice models...")
            synthesizer = self._load_voice_models(models_dir, language)

            # Step 4: Synthesize voice with your voice
            print("\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
            wav = encoder_infer.preprocess_wav(voice_path)
            embed = encoder_infer.embed_utterance(wav)

            mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
            mel = mels[0]

            print("[SongProcessor] Vocoding...")
            try:
                synthesized_vocal = vocoder_infer.infer_waveform(
                    mel, normalize=True, batched=False, target=8000, overlap=800
                ).astype(np.float32)
            except Exception as e:
                # Deliberate best-effort: any vocoder failure falls back to
                # Griffin-Lim rather than aborting the conversion.
                print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
                synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)

            # Normalize synthesized vocal to a fixed peak level; skipped for
            # an all-zero signal to avoid dividing by zero.
            max_val = np.max(np.abs(synthesized_vocal))
            if max_val > 0:
                target_level = 0.707
                synthesized_vocal = synthesized_vocal * (target_level / max_val)
            synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)

            print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Step 5: Mix with instrumental
            print("\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

        except Exception as e:
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise

        finally:
            # Runs on success and on failure. Drop the local reference first
            # so gc.collect() in _release_models() can reclaim the instance.
            synthesizer = None
            self._release_models()

        print("\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
        print(f"[SongProcessor] Output saved to: {final_audio}")

        return final_audio