|
|
import math |
|
|
import os |
|
|
|
|
|
import librosa |
|
|
import numpy as np |
|
|
import torch |
|
|
from einops import rearrange |
|
|
from transformers import AutoFeatureExtractor |
|
|
|
|
|
|
|
|
class AudioProcessor:
    """Convert an audio file into per-video-frame windows of Whisper features.

    ``get_audio_feature`` loads the file and runs the feature extractor on
    30-second segments. ``get_whisper_chunk`` feeds those features through a
    Whisper encoder and slices one fixed-length window of hidden states for
    every video frame.
    """

    def __init__(self, feature_extractor_path="openai/whisper-tiny/"):
        """Load the feature extractor found at ``feature_extractor_path``."""
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(feature_extractor_path)

    def get_audio_feature(self, wav_path, start_index=0, weight_dtype=None):
        """Load ``wav_path`` and extract input features per 30-second segment.

        Args:
            wav_path: Path of the audio file to load.
            start_index: Not used by this method; kept so existing callers
                that pass it keep working.
            weight_dtype: Optional torch dtype; when given, every feature
                tensor is cast to it.

        Returns:
            ``None`` when ``wav_path`` does not exist. Otherwise a tuple
            ``(features, num_samples)`` where ``features`` is a list with one
            ``input_features`` tensor per segment and ``num_samples`` is the
            length of the loaded waveform.
        """
        if not os.path.exists(wav_path):
            return None

        librosa_output, sampling_rate = librosa.load(wav_path, sr=16000)
        # Internal invariant: the rest of the pipeline is written for 16 kHz.
        assert sampling_rate == 16000

        # Cut the waveform into consecutive 30-second pieces; the last piece
        # may be shorter.
        segment_length = 30 * sampling_rate
        segments = [
            librosa_output[start:start + segment_length]
            for start in range(0, len(librosa_output), segment_length)
        ]

        features = []
        for segment in segments:
            audio_feature = self.feature_extractor(
                segment,
                return_tensors="pt",
                sampling_rate=sampling_rate,
            ).input_features
            if weight_dtype is not None:
                audio_feature = audio_feature.to(dtype=weight_dtype)
            features.append(audio_feature)

        return features, len(librosa_output)

    def get_whisper_chunk(
        self,
        whisper_input_features,
        device,
        weight_dtype,
        whisper,
        librosa_length,
        fps=25,
        audio_padding_length_left=2,
        audio_padding_length_right=2,
    ):
        """Encode audio features and slice one feature window per video frame.

        Args:
            whisper_input_features: Iterable of feature tensors, e.g. the
                list returned by ``get_audio_feature``.
            device: Device the features are moved to before encoding.
            weight_dtype: Dtype the features are cast to before encoding.
            whisper: Object exposing ``encoder(x, output_hidden_states=True)``
                whose result has a ``hidden_states`` sequence of tensors.
            librosa_length: Number of waveform samples (at 16 kHz) that the
                features were computed from.
            fps: Video frame rate; truncated to ``int``.
            audio_padding_length_left: Context frames before each frame.
            audio_padding_length_right: Context frames after each frame.

        Returns:
            Tensor obtained by concatenating one window per video frame along
            dim 0 and merging dims 1 and 2. Presumably
            ``(num_frames, window * num_hidden_states, feature_dim)`` when the
            encoder batch size is 1 -- confirm against the encoder in use.

        Raises:
            ValueError: If a frame's window is shorter than expected, i.e. the
                encoder produced fewer steps than ``librosa_length`` implies.
        """
        audio_feature_length_per_frame = 2 * (audio_padding_length_left + audio_padding_length_right + 1)

        # Encode every segment; stack all hidden states on a new axis (dim 2)
        # and join the segments along the time axis (dim 1).
        encoded_segments = []
        for input_feature in whisper_input_features:
            input_feature = input_feature.to(device).to(weight_dtype)
            hidden_states = whisper.encoder(input_feature, output_hidden_states=True).hidden_states
            encoded_segments.append(torch.stack(hidden_states, dim=2))
        whisper_feature = torch.cat(encoded_segments, dim=1)

        sr = 16000
        audio_fps = 50  # encoder steps per second assumed by this method
        fps = int(fps)
        whisper_idx_multiplier = audio_fps / fps
        num_frames = math.floor((librosa_length / sr) * fps)
        actual_length = math.floor((librosa_length / sr) * audio_fps)
        # Drop the encoder steps that lie beyond the real audio duration.
        whisper_feature = whisper_feature[:, :actual_length, ...]

        # Zero-pad both ends of the time axis. The padding is allocated with
        # explicit sizes: deriving it from a slice of ``whisper_feature``
        # silently shrinks it when the clip is shorter than the padding, which
        # made very short clips fail the window-length check below.
        padding_nums = math.ceil(whisper_idx_multiplier)
        left_pad = padding_nums * audio_padding_length_left
        right_pad = padding_nums * 3 * audio_padding_length_right
        batch, _, *trailing = whisper_feature.shape
        whisper_feature = torch.cat(
            [
                whisper_feature.new_zeros((batch, left_pad, *trailing)),
                whisper_feature,
                whisper_feature.new_zeros((batch, right_pad, *trailing)),
            ],
            dim=1,
        )

        audio_prompts = []
        for frame_index in range(num_frames):
            audio_index = math.floor(frame_index * whisper_idx_multiplier)
            audio_clip = whisper_feature[:, audio_index: audio_index + audio_feature_length_per_frame]
            # Raise instead of print + exit(): library code must not end the
            # host process (and exit() would report status 0 on failure).
            if audio_clip.shape[1] != audio_feature_length_per_frame:
                raise ValueError(
                    "Audio window has the wrong length: "
                    f"whisper_feature.shape: {tuple(whisper_feature.shape)}, "
                    f"audio_clip.shape: {tuple(audio_clip.shape)}, "
                    f"num frames: {num_frames}, fps: {fps}, "
                    f"whisper_idx_multiplier: {whisper_idx_multiplier}, "
                    f"frame_index: {frame_index}, "
                    f"audio_index: {audio_index}-{audio_index + audio_feature_length_per_frame}"
                )
            audio_prompts.append(audio_clip)

        audio_prompts = torch.cat(audio_prompts, dim=0)
        # Merge dims 1 and 2; same result as einops 'b c h w -> b (c h) w'.
        return audio_prompts.flatten(1, 2)
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: extract features from a local file and report sizes.
    audio_processor = AudioProcessor()
    wav_path = "./2.wav"
    result = audio_processor.get_audio_feature(wav_path)
    # get_audio_feature returns None for a missing file; unpacking that
    # directly would fail with an unrelated TypeError.
    if result is None:
        raise SystemExit(f"Audio file not found: {wav_path}")
    audio_feature, librosa_feature_length = result
    # audio_feature is a list with one tensor per segment, so it has no
    # .shape of its own; report the shape of every segment instead.
    print("Audio Feature shape:", [tuple(feature.shape) for feature in audio_feature])
    print("librosa_feature_length:", librosa_feature_length)
|
|
|
|
|
|