| import os.path |
|
|
| import pandas as pd |
| from towhee import pipe, ops |
| import torch |
| from configs import args |
| import torchaudio |
| import tempfile |
|
|
|
|
def preprocess_audio_to_mono(input_path, target_sr=16000, keep_original_format=True):
    """Write a single-channel copy of an audio file to a temporary WAV file.

    Args:
        input_path: Path of the audio file to read with ``torchaudio.load``.
        target_sr: Accepted for interface compatibility but NOT applied; the
            output keeps the sample rate of the input file (no resampling
            happens in this function).
        keep_original_format: When true and the source encoding is reported
            as ``"PCM_S"``, the output is written as 16-bit signed PCM.
            Otherwise ``torchaudio.save`` is called with its defaults.

    Returns:
        Path of the newly created temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        Whatever ``torchaudio.load``/``info``/``save`` raise. If saving
        fails, the temporary file is removed before the error propagates.
    """
    waveform, sample_rate = torchaudio.load(input_path)
    original_encoding = torchaudio.info(input_path).encoding

    # "Mono" here means keeping the first channel only; the remaining
    # channels are discarded rather than averaged in.
    if waveform.shape[0] > 1:
        waveform = waveform[:1, :]

    # mkstemp returns an open descriptor; close it right away because
    # torchaudio.save reopens the file by path.
    temp_fd, temp_path = tempfile.mkstemp(suffix='.wav')
    os.close(temp_fd)

    try:
        if keep_original_format and original_encoding == "PCM_S":
            # NOTE(review): scaling by 32767 assumes the loaded waveform is
            # float in [-1, 1] (torchaudio.load's default normalisation) —
            # confirm if load options ever change.
            waveform = (waveform * 32767).to(torch.short)
            torchaudio.save(temp_path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
        else:
            torchaudio.save(temp_path, waveform, sample_rate)
    except BaseException:
        # Do not leave an orphaned temp file behind when saving fails.
        try:
            os.unlink(temp_path)
        except OSError:
            pass
        raise

    return temp_path
|
|
|
|
def _build_audio_vggish_pipeline():
    """Assemble the towhee pipeline: audio path -> decoded frames -> VGGish vectors."""
    # Input column holding the path of the audio file.
    source = pipe.input('path')
    # Decode the file at 'path' into the 'frame' column via the ffmpeg operator.
    decoded = source.map('path', 'frame', ops.audio_decode.ffmpeg())
    # Feed 'frame' to the VGGish embedding operator, producing 'vecs'.
    embedded = decoded.map('frame', 'vecs', ops.audio_embedding.vggish())
    # Only the 'vecs' column is exposed as pipeline output.
    return embedded.output('vecs')


audio_vggish_pipeline = _build_audio_vggish_pipeline()
|
|
|
|
|
|
# Root directory of the dataset, taken from the project configuration.
data_dir = args.data_dir


def _uid_to_video_id(uid):
    """Strip the last two '_'-separated fields from a uid, leaving the video id."""
    return uid.rsplit('_', 2)[0]


# Load the metadata table and keep only rows belonging to the known splits.
metapath = os.path.join(data_dir, 'metadata.csv')
metadata = pd.read_csv(metapath, header=0)
_in_known_split = metadata['split'].isin(['train', 'val', 'test_s', 'test_u', 'test_n'])
metadata = metadata[_in_known_split]

# Several uids can map to the same video; keep each video id once.
vids = metadata['uid'].apply(_uid_to_video_id).unique()

# Output directory for the per-video embedding files.
save_dir = os.path.join(data_dir, 'audio_embed')
os.makedirs(save_dir, exist_ok=True)
|
|
# Compute and store one VGGish embedding tensor per video.
for vid in vids:
    audio_path = f'{data_dir}/media/{vid}/audio.wav'
    # The pipeline is fed a temporary single-channel copy of the audio.
    temp_path = preprocess_audio_to_mono(audio_path)
    try:
        audio_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
    finally:
        # Always remove the temporary file, even if the pipeline raises.
        os.unlink(temp_path)

    torch.save(audio_embed, f'{save_dir}/{vid}.pt')
    print(f'{vid} embedding saved {audio_embed.shape}')
|
|
|
|