Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import soundfile as sf | |
| # import torch | |
| from moviepy import AudioFileClip, VideoFileClip | |
| from pydub import AudioSegment | |
| from pytubefix import YouTube | |
| from pytubefix.cli import on_progress | |
| # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| from source_separation import Predictor | |
def token_verifier():
    """Return the ``(visitor_data, po_token)`` pair used for YouTube access.

    Passed as ``po_token_verifier`` when building the ``YouTube`` object in
    ``download_from_youtube``.

    Returns:
        A 2-tuple of strings: ``(visitor_data, po_token)``.
    """
    # NOTE(review): both values are hard-coded; presumably they are tied to a
    # session and expire -- confirm how they are meant to be refreshed.
    return (
        "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D",  # visitor_data
        "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA==",  # po_token
    )
def download_from_youtube(url, folder_path):
    """Download the highest-resolution stream of a YouTube video.

    The file is always saved as ``temp.mp4`` inside ``folder_path``. The
    video title is printed before the download starts.

    Args:
        url: URL of the YouTube video.
        folder_path: Directory the downloaded file is written to.

    Returns:
        Whatever ``Stream.download`` returns (per the pytubefix docs, the
        path of the saved file). Earlier versions of this function returned
        ``None``, so existing callers that ignore the result are unaffected.

    Raises:
        ValueError: If no stream is available for the video.
    """
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)

    stream = yt.streams.get_highest_resolution()
    if stream is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No downloadable stream found for {url!r}")
    return stream.download(output_path=folder_path, filename="temp.mp4")
def separate_video_and_audio(video_path, audio_path):
    """Extract the audio track of a video file into its own audio file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path the extracted audio is written to.

    Raises:
        ValueError: If the video has no audio track.
    """
    # The context manager closes the clip (and its underlying readers) even
    # if writing the audio fails.
    with VideoFileClip(video_path) as video_clip:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        audio_clip.write_audiofile(audio_path)
def load_audio(audio_path, sample_rate=44_100):
    """Load an audio file as a peak-normalised mono float32 waveform.

    The audio is resampled to ``sample_rate``, converted to 16-bit mono,
    nudged towards -20 dBFS (gain limited to +/-3 dB) and finally scaled so
    that its largest absolute sample equals 1.0.

    Args:
        audio_path: Path of the audio file to read.
        sample_rate: Target sampling rate in Hz.

    Returns:
        A tuple ``(waveform, sample_rate)`` where ``waveform`` is a
        ``numpy.float32`` array of samples and ``sample_rate`` is the value
        passed in. Silent or empty audio is returned unscaled rather than
        as NaN values.
    """
    audio = AudioSegment.from_file(audio_path)
    print("Entering the preprocessing of audio")

    # Bring the audio to a uniform representation: target rate, 16-bit, mono.
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # 2 bytes per sample = 16-bit
    audio = audio.set_channels(1)
    print("Audio file converted to WAV format")

    # Gain that would bring the loudness to the target level.
    target_dbfs = -20
    gain = target_dbfs - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")

    # Apply the gain, clamped to the range [-3, 3] dB.
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))

    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)

    # Peak-normalise. Skip when there are no samples (np.max would raise) or
    # the signal is all zeros (division would produce NaN).
    max_amplitude = np.max(np.abs(waveform)) if waveform.size else 0.0
    if max_amplitude > 0:
        waveform /= max_amplitude

    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))
    return waveform, sample_rate
# Settings handed to the source-separation Predictor.
# NOTE(review): margin=44100 presumably means one second at 44.1 kHz (the
# default rate used by load_audio) -- confirm against Predictor.
args = dict(
    model_path="data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    denoise=True,
    margin=44100,
    chunks=15,
    n_fft=6144,
    dim_t=8,
    dim_f=3072,
)

# Built once when the module is imported and reused by source_separation().
separate_predictor = Predictor(args=args, device="cpu")
def source_separation(waveform):
    """Split a waveform into a vocal stem and a non-vocal stem.

    Uses the module-level ``separate_predictor``.

    Args:
        waveform: Audio samples passed straight to
            ``separate_predictor.predict``.

    Returns:
        A tuple ``(vocals, no_vocals)``. Each element is column 0 of the
        corresponding array returned by the predictor.
    """
    vocal_stem, backing_stem = separate_predictor.predict(waveform)
    # The predictor output is treated as stereo; only the first channel of
    # each stem is kept.
    return vocal_stem[:, 0], backing_stem[:, 0]
def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Write the separated stems to WAV files inside ``folder_path``.

    Creates ``temp_vocals.wav`` and ``temp_no_vocals.wav``.

    Args:
        vocals: Samples of the vocal stem.
        no_vocals: Samples of the non-vocal stem.
        sample_rate: Sampling rate in Hz written to both files.
        folder_path: Target directory, with or without a trailing separator.
    """
    import os

    # os.path.join gives the same result as plain concatenation when
    # folder_path ends with a separator, and a correct path when it does not.
    sf.write(os.path.join(folder_path, "temp_vocals.wav"), vocals, sample_rate)
    sf.write(os.path.join(folder_path, "temp_no_vocals.wav"), no_vocals, sample_rate)
def combine_video_and_audio(video_path, no_vocals_path, output_path):
    """Write a copy of a video whose soundtrack is replaced by another file.

    The video is opened without its own audio, the audio at
    ``no_vocals_path`` is attached, and the result is written to
    ``output_path``.

    Args:
        video_path: Path of the source video.
        no_vocals_path: Path of the audio file used as the new soundtrack.
        output_path: Path the combined video is written to.
    """
    # Context managers close both clips (and their readers) even if
    # rendering fails part-way.
    with VideoFileClip(video_path, audio=False) as my_clip, AudioFileClip(
        no_vocals_path
    ) as audio_background:
        my_clip.audio = audio_background
        my_clip.write_videofile(output_path)
| # https://www.youtube.com/watch?v=1jZEyU_eO1s | |
def get_karaoke(url):
    """Build a vocal-free version of a YouTube video.

    Steps, all working inside ``data/samples/``:
      1. download the video,
      2. extract its audio track,
      3. load the audio as a waveform,
      4. separate vocals from the rest,
      5. export both stems as WAV files,
      6. combine the original video with the non-vocal stem.

    Args:
        url: URL of the YouTube video.

    Returns:
        Path of the resulting video file (``data/samples/result.mp4``).
    """
    folder_path = "data/samples/"

    # Fixed working-file locations inside the sample folder.
    output_path = f"{folder_path}result.mp4"
    no_vocals_path = f"{folder_path}temp_no_vocals.wav"
    audio_path = f"{folder_path}temp.mp3"
    video_path = f"{folder_path}temp.mp4"

    # Fetch the video and pull out its soundtrack.
    download_from_youtube(url, folder_path)
    separate_video_and_audio(video_path, audio_path)

    # Separate the stems and store them on disk.
    waveform, sample_rate = load_audio(audio_path)
    vocals, no_vocals = source_separation(waveform)
    export_to_wav(vocals, no_vocals, sample_rate, folder_path)

    # Attach the non-vocal stem to the original video.
    combine_video_and_audio(video_path, no_vocals_path, output_path)
    return output_path