Spaces:
Build error
Build error
import atexit
import json
import os
import shutil
import subprocess
import tempfile
import warnings
from pathlib import Path
from typing import Iterator, Optional, Tuple

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
# Suppress every Python warning process-wide.
warnings.filterwarnings("ignore")

# NLTK download for 'punkt' tokenizer data.
# NLTK is treated as optional, mirroring the Demucs/SVC guards below, so a
# missing package does not prevent the app from starting.
try:
    import nltk
except ImportError:
    nltk = None
    print("NLTK not available, skipping 'punkt' tokenizer download")
else:
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # nltk.data.find() reports a missing resource with LookupError;
        # nltk.downloader has no DownloadError attribute to catch.
        nltk.download('punkt')

# Import audio processing libraries (both are optional; flags record availability)
try:
    from demucs.pretrained import get_model
    from demucs.apply import apply_model
    DEMUCS_AVAILABLE = True
except ImportError:
    DEMUCS_AVAILABLE = False
    print("Demucs not available, using basic separation")

try:
    import so_vits_svc_fork as svc
    SVC_AVAILABLE = True
except ImportError:
    SVC_AVAILABLE = False
    print("SVC not available, using basic voice conversion")
class AICoverGenerator:
    """Pipeline that separates vocals, re-voices them and remixes the song.

    All intermediate and final files are written with fixed names into one
    temporary directory created per instance (``self.temp_dir``), so each run
    overwrites the files of the previous one.
    """

    # Sample rate used for every load/write in the pipeline.
    SAMPLE_RATE = 44100

    # Gain applied to the unconverted (dry) vocal when blending.
    DRY_GAIN = 0.3

    # Internal key -> label shown in the UI.
    VOICE_MODELS = {
        "drake": "Drake Style Voice",
        "ariana": "Ariana Style Voice",
        "weeknd": "The Weeknd Style Voice",
        "taylor": "Taylor Swift Style Voice",
        "custom": "Custom Voice Model"
    }

    # Per-voice effect settings passed to _apply_voice_characteristics.
    # 'reverb' and 'clarity' are carried along but currently have no effect.
    VOICE_PRESETS = {
        "drake": {"pitch_factor": 0.85, "formant_shift": -0.1, "roughness": 0.3},
        "ariana": {"pitch_factor": 1.2, "formant_shift": 0.2, "breathiness": 0.4},
        "weeknd": {"pitch_factor": 0.9, "formant_shift": -0.05, "reverb": 0.3},
        "taylor": {"pitch_factor": 1.1, "formant_shift": 0.1, "clarity": 0.8},
    }

    def __init__(self):
        """Pick the torch device, create the work directory and load Demucs if possible."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.temp_dir = tempfile.mkdtemp()
        # mkdtemp() never cleans up after itself; remove the directory at exit.
        atexit.register(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.voice_models = dict(self.VOICE_MODELS)

        # Initialize audio separation model (stays None when Demucs is unusable)
        self.separation_model = None
        if DEMUCS_AVAILABLE:
            try:
                self.separation_model = get_model('htdemucs')
                self.separation_model.to(self.device)
            except Exception as e:
                print(f"Error loading Demucs: {e}")
                self.separation_model = None

    def separate_vocals(self, audio_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Separate vocals and instrumentals from audio.

        Uses Demucs when a model was loaded, otherwise the basic spectral
        method. Returns ``(vocals_path, instrumental_path)``, or
        ``(None, None)`` when separation fails.
        """
        try:
            if self.separation_model is not None and DEMUCS_AVAILABLE:
                # Use Demucs for high-quality separation
                return self._demucs_separate(audio_path)
            # Use basic spectral masking
            audio, sr = librosa.load(audio_path, sr=self.SAMPLE_RATE, mono=False)
            return self._basic_separate(audio, sr)
        except Exception as e:
            print(f"Error in vocal separation: {e}")
            return None, None

    def _demucs_separate(self, audio_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Use Demucs for audio separation, falling back to the basic method on error."""
        # Load before the try block so the fallback below always has `audio` bound.
        audio, _ = librosa.load(audio_path, sr=self.SAMPLE_RATE, mono=False)
        if audio.ndim == 1:
            # Duplicate a mono signal into two identical channels.
            audio = np.stack([audio, audio])
        try:
            # Convert to a (1, channels, samples) tensor
            audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)

            # Apply separation
            with torch.no_grad():
                sources = apply_model(self.separation_model, audio_tensor)

            # Look the stems up by name rather than by hard-coded position.
            stem_names = list(self.separation_model.sources)
            vocal_idx = stem_names.index("vocals")
            backing_idx = [i for i in range(len(stem_names)) if i != vocal_idx]

            vocals = sources[0, vocal_idx].cpu().numpy()
            # The instrumental is the sum of every non-vocal stem, not just one of them.
            instrumental = sources[0][backing_idx].sum(dim=0).cpu().numpy()

            # Save separated audio
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
            sf.write(vocals_path, vocals.T, self.SAMPLE_RATE)
            sf.write(instrumental_path, instrumental.T, self.SAMPLE_RATE)
            return vocals_path, instrumental_path
        except Exception as e:
            print(f"Demucs separation error: {e}")
            return self._basic_separate(audio, self.SAMPLE_RATE)

    def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[Optional[str], Optional[str]]:
        """Basic vocal separation using a fixed spectral mask.

        This is a crude placeholder: the lowest and highest quarter of the
        STFT frequency bins are attenuated for the "vocal" track.
        Returns ``(vocals_path, instrumental_path)`` or ``(None, None)`` on error.
        """
        try:
            # Convert to mono if stereo
            if audio.ndim > 1:
                audio = librosa.to_mono(audio)
            n_samples = audio.shape[-1]

            # Compute STFT; rows are frequency bins, columns are time frames
            stft = librosa.stft(audio, n_fft=2048, hop_length=512)
            magnitude, phase = np.abs(stft), np.angle(stft)

            # Mask along the frequency axis (axis 0), not along time.
            n_bins = magnitude.shape[0]
            vocal_mask = np.ones_like(magnitude)
            vocal_mask[:n_bins // 4, :] *= 0.3  # Reduce low frequencies
            vocal_mask[3 * n_bins // 4:, :] *= 0.3  # Reduce high frequencies

            # Apply mask
            vocal_magnitude = magnitude * vocal_mask
            instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7)

            # Reconstruct audio with the original phase and original length
            vocal_stft = vocal_magnitude * np.exp(1j * phase)
            instrumental_stft = instrumental_magnitude * np.exp(1j * phase)
            vocals = librosa.istft(vocal_stft, hop_length=512, length=n_samples)
            instrumental = librosa.istft(instrumental_stft, hop_length=512, length=n_samples)

            # Save files
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
            sf.write(vocals_path, vocals, sr)
            sf.write(instrumental_path, instrumental, sr)
            return vocals_path, instrumental_path
        except Exception as e:
            print(f"Basic separation error: {e}")
            return None, None

    def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str:
        """Convert vocals to target voice.

        ``voice_model`` may be a model key ("drake") or its UI label
        ("Drake Style Voice"). ``pitch_shift`` is in semitones and
        ``voice_strength`` is the wet amount. Returns the path of the
        converted file, or ``vocals_path`` unchanged if conversion fails.
        """
        try:
            # Load vocal audio
            vocals, sr = librosa.load(vocals_path, sr=self.SAMPLE_RATE)

            # Apply pitch shifting if requested
            if pitch_shift != 0:
                vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift)

            # Simulate voice conversion (in real app, this would use trained models)
            converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength)

            # Save converted vocals
            converted_path = os.path.join(self.temp_dir, "converted_vocals.wav")
            sf.write(converted_path, converted_vocals, sr)
            return converted_path
        except Exception as e:
            print(f"Voice conversion error: {e}")
            return vocals_path  # Return original if conversion fails

    def _resolve_voice_key(self, voice_model: str) -> str:
        """Map a UI label to its model key; keys and unknown names pass through."""
        if voice_model in self.voice_models:
            return voice_model
        for key, label in self.voice_models.items():
            if label == voice_model:
                return key
        return voice_model

    def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray:
        """Simulate voice conversion (placeholder for actual model inference).

        Applies the preset effects for ``voice_model`` and blends the result
        with the untouched input: ``converted * strength`` plus the dry signal
        scaled by ``(1 - strength) * DRY_GAIN``. Models without a preset
        (e.g. "custom") are passed through the blend unprocessed.
        """
        preset = self.VOICE_PRESETS.get(self._resolve_voice_key(voice_model))
        if preset is None:
            converted = vocals
        else:
            converted = self._apply_voice_characteristics(vocals, **preset)

        # Keep the dry input separate so the blend really mixes two signals,
        # and trim both to a common length in case processing changed it.
        n = min(len(converted), len(vocals))
        return converted[:n] * strength + vocals[:n] * (1 - strength) * self.DRY_GAIN

    def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray:
        """Apply voice characteristics transformation.

        Recognised keyword arguments: ``pitch_factor`` (frequency ratio),
        ``formant_shift`` (relative stretch of the frequency axis),
        ``roughness`` (tanh drive) and ``breathiness`` (added noise level).
        Any other keyword is ignored.
        """
        sr = self.SAMPLE_RATE

        # Apply pitch factor, converted from a frequency ratio to semitones
        pitch_factor = kwargs.get('pitch_factor', 1.0)
        if pitch_factor != 1.0:
            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
                                                 n_steps=12 * np.log2(pitch_factor))

        # Apply formant shifting (simplified)
        if 'formant_shift' in kwargs:
            n_samples = len(vocals)
            stft = librosa.stft(vocals)
            magnitude = np.abs(stft)
            phase = np.angle(stft)

            # Shift formants by moving each frequency bin to a scaled index;
            # bins that would land past the top of the spectrum are dropped.
            shift_factor = 1 + kwargs['formant_shift']
            shifted_magnitude = np.zeros_like(magnitude)
            for i in range(magnitude.shape[0]):
                shifted_idx = int(i * shift_factor)
                if shifted_idx < magnitude.shape[0]:
                    shifted_magnitude[shifted_idx] = magnitude[i]

            shifted_stft = shifted_magnitude * np.exp(1j * phase)
            # Pin the output length so it still lines up with the dry signal.
            vocals = librosa.istft(shifted_stft, length=n_samples)

        # Apply effects
        if 'roughness' in kwargs:
            # Add slight distortion for roughness
            vocals = np.tanh(vocals * (1 + kwargs['roughness']))
        if 'breathiness' in kwargs:
            # Add noise for breathiness
            noise = np.random.normal(0, 0.01, vocals.shape)
            vocals = vocals + noise * kwargs['breathiness']
        return vocals

    def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> Optional[str]:
        """Mix instrumental and converted vocals.

        Both files are loaded as mono, trimmed to the shorter one, summed and
        scaled down if the peak exceeds 0.95. Returns the output path, or
        ``None`` when mixing fails.
        """
        try:
            # Load audio files
            instrumental, sr = librosa.load(instrumental_path, sr=self.SAMPLE_RATE)
            vocals, _ = librosa.load(vocals_path, sr=self.SAMPLE_RATE)

            # Ensure same length
            min_len = min(len(instrumental), len(vocals))
            instrumental = instrumental[:min_len]
            vocals = vocals[:min_len]

            # Mix audio
            mixed = instrumental + vocals * vocal_volume

            # Normalize to prevent clipping
            max_amplitude = np.max(np.abs(mixed))
            if max_amplitude > 0.95:
                mixed = mixed / max_amplitude * 0.95

            # Save mixed audio
            output_path = os.path.join(self.temp_dir, "final_cover.wav")
            sf.write(output_path, mixed, sr)
            return output_path
        except Exception as e:
            print(f"Audio mixing error: {e}")
            return None

    def process_custom_voice(self, voice_samples: list) -> str:
        """Process custom voice samples for training.

        No model is trained; the samples are only validated by total
        duration (30 s to 300 s). Each sample may be a path or an uploaded
        file object exposing ``.name``. Returns a status message.
        """
        if not voice_samples:
            return "No voice samples provided"
        try:
            # In a real implementation, this would train a voice model
            # For demo, we'll just validate the samples
            total_duration = 0
            for sample in voice_samples:
                if sample is None:
                    continue
                # Uploaded file objects carry their path in .name.
                sample_path = getattr(sample, "name", sample)
                audio, sr = librosa.load(sample_path, sr=self.SAMPLE_RATE)
                total_duration += len(audio) / sr

            if total_duration < 30:
                return "Need at least 30 seconds of voice samples"
            elif total_duration > 300:
                return "Voice samples too long (max 5 minutes)"
            else:
                return f"Custom voice model ready!\n({total_duration:.1f}s of training data)"
        except Exception as e:
            return f"Error processing voice samples: {e}"
# Initialize the AI Cover Generator
# Single module-level instance used by the handler functions below. Constructing
# it creates a temporary working directory and, when Demucs imported
# successfully, attempts to load the 'htdemucs' separation model.
cover_generator = AICoverGenerator()
def generate_cover(
    audio_file,
    voice_model: str,
    pitch_shift: int = 0,
    voice_strength: float = 80,
    auto_tune: bool = False,
    output_format: str = "wav"
) -> Iterator[Tuple[Optional[str], str]]:
    """Main function to generate AI cover.

    This is a generator: every update, including the final result and any
    error, is *yielded* as ``(cover_path_or_None, status_message)``. A value
    passed to ``return`` inside a generator is never delivered to the caller,
    so results must not be returned.

    Args:
        audio_file: Path to the uploaded song, or an upload object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.
        voice_model: Voice model key or its display label.
        pitch_shift: Pitch shift in semitones.
        voice_strength: Conversion strength in percent (0-100).
        auto_tune: Whether auto-tune was requested (not implemented yet).
        output_format: Requested output format (conversion not implemented
            yet; the result is always the mixed WAV file).
    """
    if audio_file is None:
        yield None, "Please upload an audio file"
        return

    # gr.Audio(type="filepath") hands over a plain string; file objects use .name.
    audio_path = getattr(audio_file, "name", audio_file)

    try:
        # Step 1: Separate vocals and instrumentals
        yield None, "🎵 Separating vocals and instrumentals..."
        vocals_path, instrumental_path = cover_generator.separate_vocals(audio_path)
        if vocals_path is None:
            yield None, "❌ Failed to separate vocals"
            return

        # The dropdown submits display labels; translate to the model key.
        label_to_key = {label: key for key, label in cover_generator.voice_models.items()}
        model_key = label_to_key.get(voice_model, voice_model)

        # Step 2: Convert vocals to target voice
        yield None, f"🎤 Converting vocals to {voice_model} style..."
        converted_vocals_path = cover_generator.convert_voice(
            vocals_path,
            model_key,
            pitch_shift,
            voice_strength / 100
        )

        # Step 3: Apply auto-tune if requested
        if auto_tune:
            yield None, "🎼 Applying auto-tune..."
            # Auto-tune implementation would go here

        # Step 4: Mix final audio
        yield None, "🎧 Mixing final audio..."
        final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path)
        if final_path is None:
            yield None, "❌ Failed to mix audio"
            return

        # Convert to requested format if needed
        if output_format != "wav":
            yield None, f"💾 Converting to {output_format.upper()}..."
            # Format conversion would go here

        yield final_path, "✅ AI Cover generated successfully!"
    except Exception as e:
        yield None, f"❌ Error: {str(e)}"
def process_voice_samples(voice_files) -> str:
    """Hand uploaded voice samples to the shared generator for validation.

    Returns the generator's status message, or a fixed notice when no files
    were uploaded.
    """
    if voice_files:
        return cover_generator.process_custom_voice(voice_files)
    return "No voice samples uploaded"
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks app (it is not launched here).

    The page is laid out top to bottom: header, song upload, voice model
    selection with an optional custom-voice accordion, audio settings, the
    generate button with a progress box, the result players and a legal
    notice. Event handlers are wired at the end, inside the Blocks context.
    """
    with gr.Blocks(
        title="🎵 AI Cover Song Platform",
        # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0 (as per requirements.txt change)
        css="""
.gradio-container {
font-family: 'Inter', sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
}
.main-header {
text-align: center;
padding: 2rem;
background: rgba(255, 255, 255, 0.1);
backdrop-filter: blur(10px);
border-radius: 20px;
margin: 1rem;
}
.step-container {
background: rgba(255, 255, 255, 0.05);
backdrop-filter: blur(10px);
border-radius: 15px;
padding: 1.5rem;
margin: 1rem 0;
border: 1px solid rgba(255, 255, 255, 0.1);
}
"""
    ) as app:
        # Header
        with gr.Row():
            gr.Markdown("""
<div class="main-header">
<h1 style="font-size: 3rem; margin-bottom: 1rem;">🎵 AI Cover Song Platform</h1>
<p style="font-size: 1.2rem; opacity: 0.9;">Transform any song with AI voice synthesis</p>
<div style="margin-top: 1rem;">
<span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; margin: 0 0.5rem;">🎵 Voice Separation</span>
<span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; margin: 0 0.5rem;">🎤 Voice Cloning</span>
<span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; margin: 0 0.5rem;">🎧 High Quality Audio</span>
</div>
</div>
""")
        # Step 1: Upload Audio
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎵 Step 1: Upload Your Song")
                # type="filepath": the handler receives the upload as a path string.
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    format="wav"
                )
                gr.Markdown("*Supports MP3, WAV, FLAC files*")
        # Step 2: Voice Selection
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎤 Step 2: Choose Voice Model")
                # Choices are the display labels (the values of voice_models),
                # so handlers receive a label rather than a model key.
                voice_model = gr.Dropdown(
                    choices=list(cover_generator.voice_models.values()),
                    label="Voice Model",
                    value="Drake Style Voice",
                    interactive=True
                )
                # Custom voice training section
                with gr.Accordion("🎙️ Train Custom Voice (Optional)", open=False):
                    voice_samples = gr.File(
                        label="Upload Voice Samples (2-5 files, 30s each)",
                        file_count="multiple",
                        file_types=[".wav", ".mp3"]
                    )
                    train_btn = gr.Button("Train Custom Voice", variant="secondary")
                    training_status = gr.Textbox(label="Training Status", interactive=False)
                    train_btn.click(
                        process_voice_samples,
                        inputs=[voice_samples],
                        outputs=[training_status]
                    )
        # Step 3: Audio Settings
        with gr.Row():
            with gr.Column():
                gr.Markdown("## ⚙️ Step 3: Audio Settings")
                with gr.Row():
                    pitch_shift = gr.Slider(
                        minimum=-12,
                        maximum=12,
                        value=0,
                        step=1,
                        label="Pitch Shift (semitones)"
                    )
                    # Percentage value; generate_cover divides it by 100.
                    voice_strength = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=5,
                        label="Voice Strength (%)"
                    )
                with gr.Row():
                    auto_tune = gr.Checkbox(label="Apply Auto-tune", value=False)
                    output_format = gr.Dropdown(
                        choices=["wav", "mp3", "flac"],
                        label="Output Format",
                        value="wav"
                    )
        # Step 4: Generate Cover
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎧 Step 4: Generate Cover")
                generate_btn = gr.Button(
                    "🎵 Generate AI Cover",
                    variant="primary",
                    size="lg"
                )
                progress_text = gr.Textbox(
                    label="Progress",
                    value="Ready to generate cover...",
                    interactive=False
                )
        # Results
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎉 Results")
                with gr.Row():
                    original_audio = gr.Audio(label="Original Song", interactive=False)
                    cover_audio = gr.Audio(label="AI Cover", interactive=False)
        # Legal Notice
        with gr.Row():
            gr.Markdown("""
<div style="background: rgba(255, 193, 7, 0.1);
border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem;
margin: 1rem 0;">
<h3>⚠️ Legal & Ethical Notice</h3>
<p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
Respect copyright laws and artist rights.</p>
</div>
""")
        # Event handlers
        # The input order below must match generate_cover's parameter order;
        # its (path, status) results go to the cover player and progress box.
        generate_btn.click(
            generate_cover,
            inputs=[
                audio_input,
                voice_model,
                pitch_shift,
                voice_strength,
                auto_tune,
                output_format
            ],
            outputs=[cover_audio, progress_text]
        )
        # Update original audio when file is uploaded
        audio_input.change(
            lambda x: x,
            inputs=[audio_input],
            outputs=[original_audio]
        )
    return app
# Launch the app
if __name__ == "__main__":
    app = create_interface()
    # generate_cover streams its progress by yielding. Gradio 3.x only runs
    # generator handlers when the queue is enabled; on newer versions this
    # call is harmless.
    app.queue()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )