| | import numpy as np |
| | import torch |
| | import glob |
| | import os |
| | import tqdm |
| | import librosa |
| | import parselmouth |
| | from utils.commons.pitch_utils import f0_to_coarse |
| | from utils.commons.multiprocess_utils import multiprocess_run_tqdm |
| | from utils.commons.os_utils import multiprocess_glob |
| | from utils.audio.io import save_wav |
| |
|
| | from moviepy.editor import VideoFileClip |
| | from utils.commons.hparams import hparams, set_hparams |
| |
|
def resample_wav(wav_name, out_name, sr=16000):
    """Load ``wav_name`` at sampling rate ``sr`` and write it to ``out_name``.

    Loading is done with ``librosa.core.load`` and writing with the project's
    ``save_wav`` helper. The rate handed to ``save_wav`` is the one returned
    by the loader.
    """
    loaded = librosa.core.load(wav_name, sr=sr)
    samples, loaded_sr = loaded
    save_wav(samples, out_name, loaded_sr)
| | |
def split_wav(mp4_name, wav_name=None):
    """Extract the audio track of a video into a 16 kHz wav file.

    Args:
        mp4_name: Path of the source video.
        wav_name: Destination path. When ``None`` it is derived from
            ``mp4_name`` by replacing ``".mp4"`` with ``".wav"`` and
            ``"/video/"`` with ``"/audio/"``.

    Returns:
        The path of the wav file. If that file already exists it is returned
        unchanged and the video is never opened.

    Raises:
        AssertionError: If the video has no audio track.
    """
    if wav_name is None:
        wav_name = mp4_name.replace(".mp4", ".wav").replace("/video/", "/audio/")
    if os.path.exists(wav_name):
        return wav_name

    # os.makedirs("") raises, so only create a directory when the path has one.
    wav_dir = os.path.dirname(wav_name)
    if wav_dir:
        os.makedirs(wav_dir, exist_ok=True)

    video = VideoFileClip(mp4_name, verbose=False)
    try:
        audio = video.audio
        if audio is None:
            # Raised explicitly (same exception type as the former `assert`)
            # so the check is not stripped when Python runs with -O.
            raise AssertionError(f"no audio track found in {mp4_name}")
        audio.write_audiofile(wav_name, fps=16000, verbose=False, logger=None)
    finally:
        # Always release the reader resources held by the clip.
        video.close()
    return wav_name
| |
|
def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    """Return the ``(left, right)`` padding lengths for ``x``.

    The total padding extends ``x.shape[0]`` to the next multiple of
    ``fshift`` strictly above it (for a positive ``fshift``), so it is never
    zero. With ``pad_sides == 1`` all of it goes to the right; with
    ``pad_sides == 2`` it is split between both sides, the right side taking
    the extra sample when the total is odd. ``fsize`` is not used.
    """
    assert pad_sides in (1, 2)

    length = x.shape[0]
    total = fshift - length % fshift
    if pad_sides != 1:
        half, remainder = divmod(total, 2)
        return half, half + remainder
    return 0, total
| |
|
def extract_mel_from_fname(wav_path,
                           fft_size=512,
                           hop_size=320,
                           win_length=512,
                           window="hann",
                           num_mels=80,
                           fmin=80,
                           fmax=7600,
                           eps=1e-6,
                           sample_rate=16000,
                           min_level_db=-100):
    """Compute a log10 mel spectrogram and the right-padded waveform.

    Args:
        wav_path: A ``str`` path, loaded with librosa at ``sample_rate``;
            any other value is used directly as the waveform.
        fft_size, hop_size, win_length, window: Forwarded to ``librosa.stft``
            (called with ``center=False``).
        num_mels: Number of mel filters.
        fmin: Lowest filter frequency; ``-1`` means ``0``.
        fmax: Highest filter frequency; ``-1`` means ``sample_rate / 2``.
        eps: Floor applied to the mel magnitudes before ``log10``.
        sample_rate: Sampling rate used for loading and for the filterbank.
        min_level_db: Not used by this function.

    Returns:
        ``(wav, mel)``: the transposed, zero-padded waveform (padding from
        ``librosa_pad_lr`` with ``pad_sides=1``) and the transposed log-mel
        matrix (presumably frames first -- confirm against callers).
    """
    if not isinstance(wav_path, str):
        waveform = wav_path
    else:
        waveform, _ = librosa.core.load(wav_path, sr=sample_rate)

    # Magnitude spectrogram.
    magnitude = np.abs(
        librosa.stft(waveform, n_fft=fft_size, hop_length=hop_size,
                     win_length=win_length, window=window, center=False)
    )

    # -1 is the sentinel for "use the full band".
    if fmin == -1:
        fmin = 0
    if fmax == -1:
        fmax = sample_rate / 2
    filterbank = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
    log_mel = np.log10(np.maximum(eps, filterbank @ magnitude)).T

    # The padding is applied after the spectrogram is computed, so it only
    # affects the returned waveform.
    left, right = librosa_pad_lr(waveform, fft_size, hop_size, 1)
    padded = np.pad(waveform, (left, right), mode='constant', constant_values=0.0)

    return padded.T, log_mel
| |
|
def extract_f0_from_wav_and_mel(wav, mel,
                                hop_size=320,
                                audio_sample_rate=16000,
                                ):
    """Estimate f0 with parselmouth and align its length to ``mel``.

    The pitch track is computed with ``to_pitch_ac`` using one step per hop,
    a voicing threshold of 0.6 and a search range of 80-750 Hz. ``mel`` is
    only used for its length: f0 is extended by repeating its last value, or
    truncated, so that ``len(f0) == len(mel)``.

    Returns:
        ``(f0, pitch_coarse)`` where ``pitch_coarse = f0_to_coarse(f0)``.

    Raises:
        AssertionError: If the two lengths differ by more than 8.
    """
    hop_ms = hop_size / audio_sample_rate * 1000
    floor_hz, ceiling_hz = 80, 750

    sound = parselmouth.Sound(wav, audio_sample_rate)
    pitch = sound.to_pitch_ac(
        time_step=hop_ms / 1000, voicing_threshold=0.6,
        pitch_floor=floor_hz, pitch_ceiling=ceiling_hz)
    f0 = pitch.selected_array['frequency']

    n_frames = len(mel)
    missing = n_frames - len(f0)
    assert np.abs(missing) <= 8
    if missing > 0:
        # Too short: repeat the last value to reach the mel length.
        f0 = np.concatenate((f0, np.full(missing, f0[-1])), axis=0)
    f0 = f0[:n_frames]

    return f0, f0_to_coarse(f0)
| |
|
| |
|
def extract_mel_f0_from_fname(wav_name=None, out_name=None):
    """Extract mel and f0 from a wav file and save them to one ``.npy`` file.

    This is a best-effort helper: any exception is reported on stdout and
    swallowed, so one bad file does not abort a batch run.

    Args:
        wav_name: Path of the input wav file.
        out_name: Destination path. When ``None`` it is derived from
            ``wav_name`` by replacing ``".wav"`` with ``"_mel_f0.npy"`` and
            ``"/audio/"`` with ``"/mel_f0/"``.

    Returns:
        None. The saved object is a dict with the keys ``"mel"`` and ``"f0"``.
    """
    try:
        # Honor an explicit destination; only derive one when none is given.
        if out_name is None:
            out_name = wav_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")
        # os.makedirs("") raises, so skip it for paths without a directory.
        out_dir = os.path.dirname(out_name)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        wav, mel = extract_mel_from_fname(wav_name)
        f0, _ = extract_f0_from_wav_and_mel(wav, mel)
        np.save(out_name, {"mel": mel, "f0": f0})
    except Exception as e:
        # Deliberately broad (best effort); name the file so the failure can
        # be traced in a multi-file run.
        print(f"Failed to extract mel/f0 from {wav_name}: {e}")
| |
|
def extract_mel_f0_from_video_name(mp4_name, wav_name=None, out_name=None):
    """Extract mel and f0 for a video or wav file and save them as ``.npy``.

    Args:
        mp4_name: Input path. A ``.mp4`` file first has its audio split out
            with ``split_wav``; a ``.wav`` file is used directly.
        wav_name: Wav path to use for a ``.mp4`` input (passed on to
            ``split_wav``). Replaced by ``mp4_name`` when the input is
            already a ``.wav`` file.
        out_name: Destination path. When ``None`` it is derived from
            ``mp4_name`` by replacing the extension with ``"_mel_f0.npy"``
            and ``"/video/"`` or ``"/audio/"`` with ``"/mel_f0/"``.

    Raises:
        ValueError: If the input has neither a ``.mp4`` nor a ``.wav``
            extension and ``wav_name`` or ``out_name`` was not supplied.
    """
    if mp4_name.endswith(".mp4"):
        wav_name = split_wav(mp4_name, wav_name)
        if out_name is None:
            out_name = mp4_name.replace(".mp4", "_mel_f0.npy").replace("/video/", "/mel_f0/")
    elif mp4_name.endswith(".wav"):
        wav_name = mp4_name
        if out_name is None:
            out_name = mp4_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")

    # Without a recognised extension nothing can be derived; fail clearly
    # instead of with a TypeError from os.path.dirname(None).
    if wav_name is None or out_name is None:
        raise ValueError(
            f"cannot derive wav/output paths from {mp4_name!r}: expected a "
            ".mp4 or .wav file, or explicit wav_name and out_name")

    # os.makedirs("") raises, so skip it for paths without a directory.
    out_dir = os.path.dirname(out_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    wav, mel = extract_mel_from_fname(wav_name)
    f0, _ = extract_f0_from_wav_and_mel(wav, mel)
    np.save(out_name, {"mel": mel, "f0": f0})
| |
|
| |
|
if __name__ == '__main__':
    # Command-line entry point: extract mel/f0 for one processed video's
    # 16 kHz audio track and store the result next to it.
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument('--video_id', type=str, default='May', help='')
    args = parser.parse_args()

    person_id = args.video_id

    wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
    out_name = f"data/processed/videos/{person_id}/aud_mel_f0.npy"
    # Pass out_name by keyword: the second positional parameter of
    # extract_mel_f0_from_video_name is wav_name, not out_name.
    extract_mel_f0_from_video_name(wav_16k_name, out_name=out_name)
    print(f"Saved at {out_name}")