| import argparse | |
| import os | |
| import numpy as np | |
| import librosa | |
| import torch | |
| from tqdm import tqdm | |
| from transformers import Wav2Vec2FeatureExtractor, HubertModel | |
def main(args):
    """Extract layer-wise HuBERT hidden states for every audio file in a directory.

    For each file in ``args.audio_dir_path``, saves a ``.npy`` array of shape
    (num_hidden_states, num_frames, hidden_dim) — the embedding output plus
    every transformer layer — to ``args.audio_feature_saved_path``.

    Args:
        args: Parsed CLI namespace with ``model_path``, ``audio_dir_path``,
            ``audio_feature_saved_path``, ``computed_device`` and
            ``padding_to_align_audio`` attributes.
    """
    # Gracefully fall back to CPU when CUDA was requested but is unavailable.
    if not torch.cuda.is_available() and args.computed_device == 'cuda':
        print('CUDA is not available on this device. Switching to CPU.')
        args.computed_device = 'cpu'
    device = torch.device(args.computed_device)

    model = HubertModel.from_pretrained(args.model_path).to(device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_path)
    # Inference only: freeze the CNN waveform encoder and switch to eval mode.
    model.feature_extractor._freeze_parameters()
    model.eval()

    os.makedirs(args.audio_feature_saved_path, exist_ok=True)
    # sorted() makes the processing order deterministic across platforms
    # (os.listdir order is filesystem-dependent); outputs are unchanged.
    for wavfile in tqdm(sorted(os.listdir(args.audio_dir_path))):
        npy_save_path = os.path.join(
            args.audio_feature_saved_path,
            os.path.splitext(os.path.basename(wavfile))[0] + '.npy')
        # Resume support: skip files whose features were already extracted.
        if os.path.exists(npy_save_path):
            continue
        # HuBERT expects 16 kHz input; librosa resamples on load if needed.
        audio, _ = librosa.load(os.path.join(args.audio_dir_path, wavfile), sr=16000)
        input_values = feature_extractor(
            audio, sampling_rate=16000, padding=True, do_normalize=True,
            return_tensors="pt").input_values.to(device)
        with torch.no_grad():
            outputs = model(input_values, output_hidden_states=True)
        # All hidden states: embedding output + one per transformer layer.
        ws_feats = [h.detach().cpu().numpy() for h in outputs.hidden_states]
        # (num_layers, 1, frames, dim) -> (num_layers, frames, dim); batch is 1.
        ws_feat_obj = np.squeeze(np.array(ws_feats), axis=1)
        if args.padding_to_align_audio:
            # Repeat the last frame once along the time axis — presumably to
            # align feature length with a downstream consumer's frame count;
            # TODO(review): confirm against the caller's expectation.
            ws_feat_obj = np.pad(ws_feat_obj, ((0, 0), (0, 1), (0, 0)), 'edge')
        np.save(npy_save_path, ws_feat_obj)
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="Extract audio features using a pre-trained HuBERT model.") | |
| parser.add_argument("--model_path", type=str, default='weights/chinese-hubert-large', help="Path to the pre-trained model weights.") | |
| parser.add_argument("--audio_dir_path", type=str, default='./audio_samples/raw_audios/', help="Directory containing raw audio files.") | |
| parser.add_argument("--audio_feature_saved_path", type=str, default='./audio_samples/audio_features/', help="Directory where extracted audio features will be saved.") | |
| parser.add_argument("--computed_device", type=str, default='cuda', choices=['cuda', 'cpu'], help="Device to compute the audio features on. Use 'cuda' for GPU or 'cpu' for CPU.") | |
| parser.add_argument("--padding_to_align_audio", type=bool, default=True, help="Whether to pad the audio to align features.") | |
| args = parser.parse_args() | |
| main(args) |