# File size: 2,604 Bytes
import os.path
import pandas as pd
from towhee import pipe, ops
import torch
from configs import args
import torchaudio
import tempfile
def preprocess_audio_to_mono(input_path, target_sr=16000, keep_original_format=True):
    """Write a single-channel copy of an audio file to a temporary WAV file.

    The audio is loaded with ``torchaudio.load``; if it has more than one
    channel, only the first channel is kept (channels are not averaged).
    The sample rate of the input is preserved.

    Args:
        input_path: Path of the audio file to read.
        target_sr: Currently unused; no resampling is performed. Kept so
            existing callers that pass it continue to work.
        keep_original_format: When true and the source encoding reported by
            ``torchaudio.info`` is ``"PCM_S"``, the copy is written as
            16-bit signed PCM. Otherwise ``torchaudio.save`` defaults apply.

    Returns:
        Path of the newly created temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        Whatever ``torchaudio.load`` / ``torchaudio.info`` /
        ``torchaudio.save`` raise. If saving fails, the temporary file is
        removed before the exception propagates.
    """
    waveform, sample_rate = torchaudio.load(input_path)
    original_encoding = torchaudio.info(input_path).encoding

    # Keep only the first channel when the input is multi-channel.
    if waveform.shape[0] > 1:
        waveform = waveform[:1, :]

    # mkstemp returns an open descriptor; close it so torchaudio can write
    # to the path itself.
    temp_fd, temp_path = tempfile.mkstemp(suffix='.wav')
    os.close(temp_fd)

    try:
        if keep_original_format and original_encoding == "PCM_S":
            # float -> int16; clamp first so out-of-range samples saturate
            # instead of producing an undefined/wrapped integer cast.
            waveform = (waveform.clamp(-1.0, 1.0) * 32767).to(torch.short)
            torchaudio.save(temp_path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
        else:
            torchaudio.save(temp_path, waveform, sample_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.unlink(temp_path)
        raise

    return temp_path
# towhee pipeline used to embed audio: it takes one input named 'path',
# maps 'path' -> 'frame' with ops.audio_decode.ffmpeg(), maps
# 'frame' -> 'vecs' with ops.audio_embedding.vggish(), and outputs 'vecs'.
# Callers invoke it as audio_vggish_pipeline(path).get()[0].
audio_vggish_pipeline = ( # pipeline building
pipe.input('path')
.map('path', 'frame', ops.audio_decode.ffmpeg())
.map('frame', 'vecs', ops.audio_embedding.vggish())
.output('vecs')
)
# Root directory of the dataset, read from the project's configs.args;
# metadata.csv, media/ and audio_embed/ are all resolved relative to it.
data_dir = args.data_dir
# test_id = 'zxis5LLvULw_12000_22000'
# test_path = f'{data_dir}/media/{test_id}/audio.wav'
# temp_path = preprocess_audio_to_mono(test_path)
# print(f"original audio info: {torchaudio.info(test_path)}")
# print(f"mono audio info: :{torchaudio.info(temp_path)}")
# test_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
# print(test_embed.shape)
# os.unlink(temp_path)
#
#
# test_id = 'null_c-45AfEdAU050_99000_109000'
# test_path = f'{data_dir}/media/{test_id}/audio.wav'
# temp_path = preprocess_audio_to_mono(test_path)
# print(f"original audio info: {torchaudio.info(test_path)}")
# print(f"mono audio info: :{torchaudio.info(temp_path)}")
# test_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
# print(test_embed.shape)
# os.unlink(temp_path)
# Load the dataset metadata and keep only rows from the splits of interest.
metapath = os.path.join(data_dir, 'metadata.csv')
metadata = pd.read_csv(metapath, header=0)
metadata = metadata[metadata['split'].isin(['train', 'val', 'test_s', 'test_u', 'test_n'])]
# metadata = metadata[metadata['split'].isin(['test_s'])]

# Derive the video id from each uid by dropping its last two '_'-separated
# fields, then de-duplicate so each video is embedded once.
vids = metadata['uid'].apply(lambda x: x.rsplit('_', 2)[0]).unique()

# Embeddings are written to <data_dir>/audio_embed/<vid>.pt.
save_dir = os.path.join(data_dir, 'audio_embed')
os.makedirs(save_dir, exist_ok=True)

for vid in vids:
    audio_path = f'{data_dir}/media/{vid}/audio.wav'
    # The pipeline is fed a temporary single-channel copy of the audio.
    temp_path = preprocess_audio_to_mono(audio_path)
    try:
        audio_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
    finally:
        # Always remove the temporary file, even if the pipeline raises,
        # so a failed run does not leave temp WAV files behind.
        os.unlink(temp_path)
    # print(f"{vid}: {audio_embed.shape}")
    torch.save(audio_embed, f'{save_dir}/{vid}.pt')
    print(f'{vid} embedding saved {audio_embed.shape}')
|