# dataloaders/utils/audio_features.py
"""modified from https://github.com/yesheng-THU/GFGE/blob/main/data_processing/audio_features.py"""
import numpy as np
import librosa
import math
import os
import scipy.io.wavfile as wav
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from tqdm import tqdm
from typing import Optional, Tuple
from numpy.lib import stride_tricks
from loguru import logger
# Import Wav2Vec2Model to make it available for other modules
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from models.utils.wav2vec import Wav2Vec2Model


def process_audio_data(audio_file, args, data, f_name, selected_file):
    """Process audio data with support for different representations."""
    logger.info(f"# ---- Building cache for Audio {f_name} ---- #")
    if not os.path.exists(audio_file):
        logger.warning(f"# ---- file not found for Audio {f_name}, skipping all files with the same id ---- #")
        selected_file.drop(selected_file[selected_file['id'] == f_name].index, inplace=True)
        return None
    audio_save_path = audio_file.replace("wave16k", "onset_amplitude").replace(".wav", ".npy")
    if args.audio_rep == "onset+amplitude" and os.path.exists(audio_save_path):
        data['audio'] = np.load(audio_save_path)
        logger.warning(f"# ---- cache found for Audio {f_name} ---- #")
    elif args.audio_rep == "onset+amplitude":
        data['audio'] = calculate_onset_amplitude(audio_file, args.audio_sr, audio_save_path)
    elif args.audio_rep == "mfcc":
        # Note: despite the "mfcc" name, this branch computes a 128-bin mel spectrogram.
        # Load at the target sample rate so it matches the sr passed to melspectrogram.
        audio_data, _ = librosa.load(audio_file, sr=args.audio_sr)
        # hop_length = sr / fps yields one spectrogram frame per motion frame
        data['audio'] = librosa.feature.melspectrogram(
            y=audio_data,
            sr=args.audio_sr,
            n_mels=128,
            hop_length=int(args.audio_sr / args.audio_fps)
        ).transpose(1, 0)
    if args.audio_norm and args.audio_rep == "wave16k":
        data['audio'] = (data['audio'] - args.mean_audio) / args.std_audio
    return data
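
# Illustrative usage sketch for process_audio_data (a minimal sketch, not the
# repo's training pipeline). Assumes an `args` namespace exposing the fields
# read above and a pandas DataFrame `selected_file` with an 'id' column; the
# path and values below are hypothetical:
#
#   from types import SimpleNamespace
#   args = SimpleNamespace(audio_rep="onset+amplitude", audio_sr=16000,
#                          audio_fps=30, audio_norm=False)
#   data = process_audio_data("path/to/wave16k/clip.wav", args, data={},
#                             f_name="clip", selected_file=selected_file)
#   # data['audio'] -> (n_samples, 2) array of [amplitude, onset] features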


def calculate_onset_amplitude(audio_file, audio_sr, save_path):
    """Calculate per-sample onset and amplitude-envelope features from an audio file."""
    # Load at the native sample rate, then resample once to the target rate
    audio_data, sr = librosa.load(audio_file, sr=None)
    audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=audio_sr)
    # Amplitude envelope: max |x| over a sliding window, one value per sample
    frame_length = 1024
    shape = (audio_data.shape[-1] - frame_length + 1, frame_length)
    strides = (audio_data.strides[-1], audio_data.strides[-1])
    rolling_view = stride_tricks.as_strided(audio_data, shape=shape, strides=strides)
    amplitude_envelope = np.max(np.abs(rolling_view), axis=1)
    # Pad the tail so the envelope has the same length as the audio
    amplitude_envelope = np.pad(amplitude_envelope, (0, frame_length - 1), mode='constant',
                                constant_values=amplitude_envelope[-1])
    # Onset indicator: mark detected onsets at sample resolution so the array
    # aligns with the per-sample amplitude envelope
    audio_onset_s = librosa.onset.onset_detect(y=audio_data, sr=audio_sr, units='samples')
    onset_array = np.zeros(len(audio_data), dtype=float)
    onset_array[audio_onset_s] = 1.0
    # Stack into an (n_samples, 2) feature matrix: [amplitude, onset]
    features = np.concatenate([amplitude_envelope.reshape(-1, 1), onset_array.reshape(-1, 1)], axis=1)
    # Cache the features to disk for reuse
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    np.save(save_path, features)
    return features
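

# Minimal smoke test for calculate_onset_amplitude, runnable as a script.
# The wav path below is a placeholder assumption; point it at any mono wav
# (it will be resampled to 16 kHz) to inspect the resulting feature shape.
if __name__ == "__main__":
    example_wav = "path/to/wave16k/example.wav"  # hypothetical path
    if os.path.exists(example_wav):
        example_save = example_wav.replace("wave16k", "onset_amplitude").replace(".wav", ".npy")
        feats = calculate_onset_amplitude(example_wav, audio_sr=16000, save_path=example_save)
        # One row per audio sample: column 0 = amplitude envelope, column 1 = onset flag
        logger.info(f"onset+amplitude features: {feats.shape}")
    else:
        logger.warning(f"example wav not found at {example_wav}; edit the path to run this demo")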