Spaces:

WeixuanYuan
/

DiffuSynth

Paused

App Files Files Community

WeixuanYuan committed on Sep 12, 2024

Commit

1ecb721

verified ·

1 Parent(s): cf4423e

Upload 8 files

Browse files

Files changed (8) hide show

metrics/FD.py +293 -0
metrics/IS.py +218 -0
metrics/P_C_T.py +12 -0
metrics/get_reference_AST_features.py +63 -0
metrics/pipelines.py +144 -0
metrics/pipelines_STFT.py +100 -0
metrics/precision_recall.py +204 -0
metrics/visualizations.py +123 -0

metrics/FD.py ADDED Viewed

	@@ -0,0 +1,293 @@

+import json
+import os
+import librosa
+import numpy as np
+import torch
+from tqdm import tqdm
+from scipy.linalg import sqrtm
+from metrics.pipelines import sample_pipeline, sample_pipeline_GAN
+from metrics.pipelines_STFT import sample_pipeline_STFT, sample_pipeline_GAN_STFT
+from tools import rms_normalize
def ASTaudio2feature(device, signal, processor, AST, sampling_rate):
    """Embed audio with an AST model and return one vector per input item.

    The processor output is moved to ``device``, passed to ``AST`` with
    gradient tracking disabled, and the hidden state at sequence position 0
    is returned as a NumPy array living on the CPU.
    """
    model_inputs = processor(signal, sampling_rate=sampling_rate, return_tensors="pt")
    model_inputs = model_inputs.to(device)
    with torch.no_grad():
        hidden_states = AST(**model_inputs).last_hidden_state
    # Keep only sequence position 0 of every item (presumably the [CLS] token).
    first_position = hidden_states[:, 0, :]
    return first_position.to("cpu").detach().numpy()
# Mean vector and covariance matrix of a set of feature vectors.
def calculate_statistics(features):
    """Return ``(mu, sigma)`` for ``features``.

    Rows are treated as observations and columns as variables: ``mu`` is the
    mean over axis 0 and ``sigma`` is ``np.cov(..., rowvar=False)``.
    """
    covariance = np.cov(features, rowvar=False)
    mean_vector = np.mean(features, axis=0)
    return mean_vector, covariance
# Fréchet distance between two Gaussians given as (mean, covariance).
def calculate_fid(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Compute the Fréchet distance between two Gaussian distributions.

    Args:
        mu1, mu2: Mean vectors.
        sigma1, sigma2: Covariance matrices (square, same size as the means).
        eps: Small value added to the diagonal of both covariance matrices
            before the matrix square root is taken.

    Returns:
        ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``
        evaluated on the regularised covariances.

    The inputs are never modified. The previous implementation used in-place
    ``+=``, which changed the caller's matrices on every call (and added
    ``eps`` twice when both arguments were the same array).
    """
    mu1 = np.asarray(mu1)
    mu2 = np.asarray(mu2)
    # Build new regularised matrices instead of mutating the arguments.
    sigma1 = np.asarray(sigma1) + np.eye(np.shape(sigma1)[0]) * eps
    sigma2 = np.asarray(sigma2) + np.eye(np.shape(sigma2)[0]) * eps

    ssdiff = np.sum((mu1 - mu2) ** 2.0)
    covmean = sqrtm(sigma1.dot(sigma2))
    # Numerical error can produce a complex result; keep only the real part.
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    return ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
# Fréchet distance between two pre-computed {"mu", "sigma"} statistics dicts.
def calculate_fid_dict(dict1, dict2, eps=1e-6):
    """Compute the Fréchet distance from two statistics dictionaries.

    Args:
        dict1, dict2: Mappings with keys ``"mu"`` (mean vector) and
            ``"sigma"`` (covariance matrix).
        eps: Small value added to the diagonal of both covariance matrices
            before the matrix square root is taken.

    Returns:
        ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``
        evaluated on the regularised covariances.

    The arrays stored in the dictionaries are left untouched. The previous
    implementation used in-place ``+=``, so every comparison against a cached
    reference entry permanently shifted that entry's covariance by ``eps``.
    """
    mu1 = np.asarray(dict1["mu"])
    mu2 = np.asarray(dict2["mu"])
    # Build new regularised matrices instead of mutating the stored ones.
    sigma1 = np.asarray(dict1["sigma"]) + np.eye(np.shape(dict1["sigma"])[0]) * eps
    sigma2 = np.asarray(dict2["sigma"]) + np.eye(np.shape(dict2["sigma"])[0]) * eps

    ssdiff = np.sum((mu1 - mu2) ** 2.0)
    covmean = sqrtm(sigma1.dot(sigma2))
    # Numerical error can produce a complex result; keep only the real part.
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    return ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
+# Todo: AudioLDM
+# def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False):
+#     diffuSynth_features = []
+#     # Step 1: Load all wav files in AudioLDM_signals_directory_path
+#     AudioLDM_signals = []
+#     signal_lengths = set()
+#     for file_name in os.listdir(AudioLDM_signals_directory_path):
+#         if file_name.endswith('.wav'):
+#             file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
+#             signal, sr = librosa.load(file_path, sr=16000)  # Load audio file with sampling rate 16000
+#             # Normalize
+#             AudioLDM_signals.append(rms_normalize(signal))
+#             signal_lengths.add(len(signal))
+#     # Step 2: Check if all signals have the same length
+#     if len(signal_lengths) != 1:
+#         raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.")
+#     # Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length]
+#     batch_size = 8
+#     signal_length = signal_lengths.pop()  # All lengths are the same, get one of them
+#     # Create batches
+#     signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)]
+#     for signal_batch in tqdm(signal_batches):
+#         features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=16000)
+#         diffuSynth_features.extend(features)
+#     if return_feature:
+#         return diffuSynth_features
+#     else:
+#         mu, sigma = calculate_statistics(diffuSynth_features)
+#         return {"mu": mu,  "sigma": sigma}
def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False):
    """Extract AST features from the ``.wav`` files in a directory.

    Every ``.wav`` file (names starting with ``._`` are ignored) is loaded at
    16 kHz, cut to its first 4 seconds, RMS-normalised and embedded with
    ``ASTaudio2feature`` in batches of 8. Files that fail to load or are
    shorter than 4 seconds are reported on stdout and skipped.

    Args:
        device: Device passed on to ``ASTaudio2feature``.
        processor: Audio processor passed on to ``ASTaudio2feature``.
        AST: Model passed on to ``ASTaudio2feature``.
        AudioLDM_signals_directory_path: Directory containing the audio files.
        return_feature: If true, return the list of feature vectors;
            otherwise return their statistics.

    Returns:
        A list of feature vectors, or ``{"mu": ..., "sigma": ...}`` as
        computed by ``calculate_statistics``.

    Raises:
        ValueError: If no usable audio file was found.
    """
    sampling_rate = 16000
    target_length = 4 * sampling_rate  # 4 seconds of audio
    batch_size = 8

    signals = []
    # Sorted so the order of the returned features does not depend on the
    # file system's listing order.
    for file_name in sorted(os.listdir(AudioLDM_signals_directory_path)):
        if not file_name.endswith('.wav') or file_name.startswith('._'):
            continue
        file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
        try:
            signal, _ = librosa.load(file_path, sr=sampling_rate)
            if len(signal) < target_length:
                print(f"Error loading {file_name}: The file {file_name} is shorter than 4 seconds.")
                continue
            # Keep only the first 4 seconds, then normalise.
            signals.append(rms_normalize(signal[:target_length]))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole evaluation run.
            print(f"Error loading {file_name}: {e}")

    # All kept signals have exactly target_length samples, so the only
    # failure mode left is "nothing usable was found".
    if not signals:
        raise ValueError(
            f"No usable .wav files (at least 4 seconds long) were found in {AudioLDM_signals_directory_path}.")

    diffuSynth_features = []
    for start in tqdm(range(0, len(signals), batch_size)):
        signal_batch = signals[start:start + batch_size]
        features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=sampling_rate)
        diffuSynth_features.extend(features)

    if return_feature:
        return diffuSynth_features
    mu, sigma = calculate_statistics(diffuSynth_features)
    return {"mu": mu, "sigma": sigma}
def generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
                                              positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms", return_feature=False):
    """Sample audio with the diffusion pipeline and embed it with AST.

    ``num_batches`` batches of 8 signals are generated with
    ``sample_pipeline`` (``task="spectrograms"``) or ``sample_pipeline_STFT``
    (``task="STFT"``); any other task raises ``NotImplementedError``. Each
    batch is embedded through ``ASTaudio2feature`` at 16 kHz.

    Returns the list of feature vectors when ``return_feature`` is true,
    otherwise ``{"mu": ..., "sigma": ...}`` from ``calculate_statistics``.
    """
    if task not in ("spectrograms", "STFT"):
        raise NotImplementedError
    pipe = sample_pipeline if task == "spectrograms" else sample_pipeline_STFT

    collected_features = []
    for _ in tqdm(range(num_batches)):
        # Only the decoded signals are needed; latents and spectrograms are dropped.
        _, _, batch_signals = pipe(
            device, uNet, VAE, mmm, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
            return_latent=False,
        )
        collected_features.extend(
            ASTaudio2feature(device, batch_signals, processor, AST, sampling_rate=16000))

    if not return_feature:
        mu, sigma = calculate_statistics(collected_features)
        return {"mu": mu,  "sigma": sigma}
    return collected_features
def generate_features_with_GAN_and_AST(device, gan_generator, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
                                              positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms", return_feature=False):
    """Sample audio with the GAN pipeline and embed it with AST.

    ``num_batches`` batches of 8 signals are generated with
    ``sample_pipeline_GAN`` (``task="spectrograms"``) or
    ``sample_pipeline_GAN_STFT`` (``task="STFT"``); any other task raises
    ``NotImplementedError``. Each batch is embedded through
    ``ASTaudio2feature`` at 16 kHz.

    Returns the list of feature vectors when ``return_feature`` is true,
    otherwise ``{"mu": ..., "sigma": ...}`` from ``calculate_statistics``.
    """
    if task not in ("spectrograms", "STFT"):
        raise NotImplementedError
    pipe = sample_pipeline_GAN if task == "spectrograms" else sample_pipeline_GAN_STFT

    collected_features = []
    for _ in tqdm(range(num_batches)):
        # Only the decoded signals are needed; latents and spectrograms are dropped.
        _, _, batch_signals = pipe(
            device, gan_generator, VAE, mmm, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
            return_latent=False,
        )
        collected_features.extend(
            ASTaudio2feature(device, batch_signals, processor, AST, sampling_rate=16000))

    if not return_feature:
        mu, sigma = calculate_statistics(collected_features)
        return {"mu": mu,  "sigma": sigma}
    return collected_features
def get_FD(train_features, device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches, positive_prompts,
           negative_prompts="", CFG=1, sample_steps=10):
    """Print and return the Fréchet distance between real and generated features.

    Args:
        train_features: Feature vectors of the reference (real) audio.
        device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
        positive_prompts, negative_prompts, CFG, sample_steps: Forwarded to
            ``generate_features_with_diffuSynth_and_AST``.

    Returns:
        The score computed by ``calculate_fid``. (Earlier versions only
        printed it and returned ``None``.)
    """
    # return_feature=True is required: with the default the generator returns
    # a {"mu", "sigma"} dict, which calculate_statistics cannot process.
    diffuSynth_features = generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor,
                                                                    AST, num_batches, positive_prompts,
                                                                    negative_prompts=negative_prompts, CFG=CFG,
                                                                    sample_steps=sample_steps,
                                                                    return_feature=True)
    mu_real, sigma_real = calculate_statistics(train_features)
    mu_gen, sigma_gen = calculate_statistics(diffuSynth_features)
    fid_score = calculate_fid(mu_real, sigma_real, mu_gen, sigma_gen)
    print('FID score:', fid_score)
    return fid_score
def get_fid_score(feature1, features2):
    """Return the Fréchet distance between two sets of feature vectors.

    Statistics of each set come from ``calculate_statistics`` and are
    compared with ``calculate_fid``.
    """
    stats_first = calculate_statistics(feature1)
    stats_second = calculate_statistics(features2)
    return calculate_fid(*stats_first, *stats_second)
def calculate_fid_matrix(features_list_1, features_list_2, get_fid_score):
    """Score every pair drawn from two lists of feature sets.

    Returns a nested list with ``len(features_list_1)`` rows and
    ``len(features_list_2)`` columns, where entry ``[i][j]`` is
    ``get_fid_score(features_list_1[i], features_list_2[j])``.
    """
    return [
        [get_fid_score(row_features, col_features) for col_features in features_list_2]
        for row_features in features_list_1
    ]
def save_AST_feature(key, mu, sigma, path='results/AST_metric/pre_calculated_features/AST_features.json'):
    """Store one ``{"mu", "sigma"}`` entry under ``key`` in a JSON file.

    Existing entries in the file are kept; an existing entry with the same
    key is overwritten. NumPy arrays are converted to nested lists first.

    Args:
        key: Name of the entry.
        mu: Mean vector (NumPy array or list).
        sigma: Covariance matrix (NumPy array or nested list).
        path: Target JSON file. Missing parent directories are created.
    """
    # Start from the existing file content, or from scratch if there is none.
    try:
        with open(path, 'r') as file:
            data = json.load(file)
    except FileNotFoundError:
        data = {}

    if isinstance(mu, np.ndarray):
        mu = mu.tolist()
    if isinstance(sigma, np.ndarray):
        sigma = sigma.tolist()
    data[key] = {"mu": mu, "sigma": sigma}

    # The write used to fail on a fresh checkout because the directory of the
    # default path did not exist yet.
    parent_directory = os.path.dirname(path)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)
    with open(path, 'w') as file:
        json.dump(data, file, indent=4)
def read_AST_features(path='results/AST_metric/pre_calculated_features/AST_features.json'):
    """Load the stored AST statistics and convert them back to NumPy arrays.

    Returns a dict mapping each entry name to ``{"mu": ndarray,
    "sigma": ndarray}``. If the file is missing or is not valid JSON, a
    message is printed and an empty dict is returned.
    """
    try:
        with open(path, 'r') as file:
            loaded = json.load(file)
    except FileNotFoundError:
        # No file yet: report it and fall back to an empty result.
        print(f"文件 {path} 未找到.")
        return {}
    except json.JSONDecodeError:
        # File exists but cannot be parsed: report it and fall back to an empty result.
        print(f"文件 {path} 不是有效的JSON格式.")
        return {}

    for entry in loaded.values():
        entry["mu"] = np.array(entry["mu"])
        entry["sigma"] = np.array(entry["sigma"])
    return loaded

metrics/IS.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import os
+import librosa
+import numpy as np
+import torch
+from tqdm import tqdm
+from metrics.pipelines import sample_pipeline, inpaint_pipeline, sample_pipeline_GAN
+from metrics.pipelines_STFT import sample_pipeline_STFT, sample_pipeline_GAN_STFT
+from tools import rms_normalize, pad_STFT, encode_stft
+from webUI.natural_language_guided.utils import InputBatch2Encode_STFT
def get_inception_score_for_AudioLDM(device, timbre_encoder, VAE, AudioLDM_signals_directory_path):
    """Compute the inception score of the ``.wav`` files in a directory.

    Every ``.wav`` file (names starting with ``._`` are ignored) is loaded at
    16 kHz, cut to its first 4 seconds and RMS-normalised. Its STFT is padded
    and encoded with ``pad_STFT`` / ``encode_stft``, passed through the VAE
    encoder and quantizer in batches of 8, and classified by
    ``timbre_encoder``. The softmax of the instrument logits is fed to
    ``inception_score``.

    Args:
        device: Device the spectrogram batches are moved to.
        timbre_encoder: Classifier returning ``(feature, instrument_logits,
            instrument_family_logits, velocity_logits, qualities)``.
        VAE: Object exposing ``_encoder`` and ``_vq_vae``.
        AudioLDM_signals_directory_path: Directory containing the audio files.

    Returns:
        The value returned by ``inception_score``.

    Raises:
        ValueError: If a file is shorter than 4 seconds or no ``.wav`` file
            was found.
    """
    VAE_encoder, VAE_quantizer = VAE._encoder, VAE._vq_vae
    sampling_rate = 16000
    target_length = 4 * sampling_rate  # 4 seconds of audio
    batch_size = 8

    signals = []
    for file_name in sorted(os.listdir(AudioLDM_signals_directory_path)):
        if not file_name.endswith('.wav') or file_name.startswith('._'):
            continue
        file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
        signal, _ = librosa.load(file_path, sr=sampling_rate)
        if len(signal) < target_length:
            raise ValueError(f"The file {file_name} is shorter than 4 seconds.")
        # Keep only the first 4 seconds, then normalise.
        signals.append(rms_normalize(signal[:target_length]))

    # All kept signals have exactly target_length samples, so the only
    # failure mode left is an empty directory.
    if not signals:
        raise ValueError(f"No .wav files were found in {AudioLDM_signals_directory_path}.")

    encoded_audios = [
        encode_stft(pad_STFT(librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)))
        for signal in signals
    ]
    spectrograms = torch.from_numpy(np.array(encoded_audios)).float().to(device)

    diffuSynth_probabilities = []
    # Pure inference: no autograd graph is needed for the encoder or classifier.
    with torch.no_grad():
        for start in tqdm(range(0, spectrograms.shape[0], batch_size)):
            spectrogram_batch = spectrograms[start:start + batch_size]
            _, _, _, _, quantized_latent_representations = InputBatch2Encode_STFT(
                VAE_encoder, spectrogram_batch, quantizer=VAE_quantizer, squared=False)
            _, instrument_logits, _, _, _ = timbre_encoder(quantized_latent_representations)
            probabilities = torch.nn.functional.softmax(instrument_logits, dim=1)
            diffuSynth_probabilities.extend(probabilities.to("cpu").detach().numpy())
    return inception_score(np.array(diffuSynth_probabilities))
+# def get_inception_score_for_AudioLDM(device, timbre_encoder, VAE, AudioLDM_signals_directory_path):
+#     VAE_encoder, VAE_quantizer, VAE_decoder = VAE._encoder, VAE._vq_vae, VAE._decoder
+#
+#     diffuSynth_probabilities = []
+#
+#     # Step 1: Load all wav files in AudioLDM_signals_directory_path
+#     AudioLDM_signals = []
+#     signal_lengths = set()
+#
+#     for file_name in os.listdir(AudioLDM_signals_directory_path):
+#         if file_name.endswith('.wav'):
+#             file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
+#             signal, sr = librosa.load(file_path, sr=16000)  # Load audio file with sampling rate 16000
+#             # Normalize
+#             AudioLDM_signals.append(rms_normalize(signal))
+#             signal_lengths.add(len(signal))
+#
+#     # Step 2: Check if all signals have the same length
+#     if len(signal_lengths) != 1:
+#         raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.")
+#
+#     encoded_audios = []
+#     for origin_audio in AudioLDM_signals:
+#         D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
+#         padded_D = pad_STFT(D)
+#         encoded_D = encode_stft(padded_D)
+#         encoded_audios.append(encoded_D)
+#     encoded_audios_np = np.array(encoded_audios)
+#     origin_spectrogram_batch_tensor = torch.from_numpy(encoded_audios_np).float().to(device)
+#
+#
+#     # Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length]
+#     batch_size = 8
+#     num_batches = int(np.ceil(origin_spectrogram_batch_tensor.shape[0] / batch_size))
+#     spectrogram_batches = []
+#     for i in range(num_batches):
+#         batch = origin_spectrogram_batch_tensor[i * batch_size:(i + 1) * batch_size]
+#         spectrogram_batches.append(batch)
+#
+#
+#     for spectrogram_batch in tqdm(spectrogram_batches):
+#         spectrogram_batch = spectrogram_batch.to(device)
+#         _, _, _, _, quantized_latent_representations = InputBatch2Encode_STFT(VAE_encoder, spectrogram_batch, quantizer=VAE_quantizer,squared=False)
+#         quantized_latent_representations = quantized_latent_representations
+#         feature, instrument_logits, instrument_family_logits, velocity_logits, qualities = timbre_encoder(quantized_latent_representations)
+#         probabilities = torch.nn.functional.softmax(instrument_logits, dim=1)
+#
+#         diffuSynth_probabilities.extend(probabilities.to("cpu").detach().numpy())
+#
+#     return inception_score(np.array(diffuSynth_probabilities))
def get_inception_score(device, uNet, VAE, MMM, CLAP_tokenizer, timbre_encoder, num_batches, positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms"):
    """Inception score of latents sampled with the diffusion pipeline.

    ``num_batches`` batches of 8 quantized latents are drawn with
    ``sample_pipeline`` (``task="spectrograms"``) or ``sample_pipeline_STFT``
    (``task="STFT"``); any other task raises ``NotImplementedError``. The
    softmax of the instrument logits predicted by ``timbre_encoder`` is
    passed to ``inception_score``.
    """
    if task not in ("spectrograms", "STFT"):
        raise NotImplementedError
    pipe = sample_pipeline if task == "spectrograms" else sample_pipeline_STFT

    class_probabilities = []
    for _ in tqdm(range(num_batches)):
        latents = pipe(
            device, uNet, VAE, MMM, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
        )
        latents = latents.to(device)
        _, instrument_logits, _, _, _ = timbre_encoder(latents)
        batch_probabilities = torch.nn.functional.softmax(instrument_logits, dim=1)
        class_probabilities.extend(batch_probabilities.to("cpu").detach().numpy())
    return inception_score(np.array(class_probabilities))
def get_inception_score_GAN(device, gan_generator, VAE, MMM, CLAP_tokenizer, timbre_encoder, num_batches, positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms"):
    """Inception score of latents sampled with the GAN pipeline.

    ``num_batches`` batches of 8 quantized latents are drawn with
    ``sample_pipeline_GAN`` (``task="spectrograms"``) or
    ``sample_pipeline_GAN_STFT`` (``task="STFT"``); any other task raises
    ``NotImplementedError``. The softmax of the instrument logits predicted
    by ``timbre_encoder`` is passed to ``inception_score``.
    """
    if task not in ("spectrograms", "STFT"):
        raise NotImplementedError
    pipe = sample_pipeline_GAN if task == "spectrograms" else sample_pipeline_GAN_STFT

    class_probabilities = []
    for _ in tqdm(range(num_batches)):
        latents = pipe(
            device, gan_generator, VAE, MMM, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
        )
        latents = latents.to(device)
        _, instrument_logits, _, _, _ = timbre_encoder(latents)
        batch_probabilities = torch.nn.functional.softmax(instrument_logits, dim=1)
        class_probabilities.extend(batch_probabilities.to("cpu").detach().numpy())
    return inception_score(np.array(class_probabilities))
def predict_qualities_with_diffuSynth_sample(device, uNet, VAE, MMM, CLAP_tokenizer, timbre_encoder, num_batches, positive_prompts, negative_prompts="", CFG=6, sample_steps=10):
    """Average the quality predictions of ``timbre_encoder`` over sampled latents.

    ``num_batches`` batches of 8 quantized latents are drawn with
    ``sample_pipeline``; the ``qualities`` output of ``timbre_encoder`` is
    collected for every sample and averaged over axis 0.
    """
    collected_qualities = []
    for _ in tqdm(range(num_batches)):
        latents = sample_pipeline(
            device, uNet, VAE, MMM, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
        )
        latents = latents.to(device)
        _, _, _, _, batch_qualities = timbre_encoder(latents)
        # Raw predicted values are averaged; no thresholding is applied.
        collected_qualities.extend(batch_qualities.to("cpu").detach().numpy())
    return np.mean(collected_qualities, axis=0)
def generate_probabilities_with_diffuSynth_inpaint(device, uNet, VAE, MMM, CLAP_tokenizer, timbre_encoder, num_batches, guidance, duration, use_dynamic_mask, noising_strength, positive_prompts, negative_prompts="", CFG=6, sample_steps=10):
    """Inpaint ``num_batches`` batches of 8 and classify the resulting latents.

    Each batch comes from ``inpaint_pipeline`` (called with
    ``mask_flexivity=0.999`` and ``return_latent=False``). Returns a pair
    ``(probabilities, signals)``: an array holding the softmax of the
    instrument logits for every sample, and the list of reconstructed
    signals.
    """
    class_probabilities = []
    all_signals = []
    for _ in tqdm(range(num_batches)):
        latents, _, batch_signals = inpaint_pipeline(
            device, uNet, VAE, MMM, CLAP_tokenizer,
            use_dynamic_mask=use_dynamic_mask,
            noising_strength=noising_strength,
            guidance=guidance,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
            duration=duration,
            mask_flexivity=0.999,
            return_latent=False,
        )
        latents = latents.to(device)
        _, instrument_logits, _, _, _ = timbre_encoder(latents)
        batch_probabilities = torch.nn.functional.softmax(instrument_logits, dim=1)
        class_probabilities.extend(batch_probabilities.to("cpu").detach().numpy())
        all_signals.extend(batch_signals)
    return np.array(class_probabilities), all_signals
def inception_score(pred):
    """Compute the inception score of per-sample class scores.

    ``pred`` has one row per sample and one column per class. Rows are
    renormalised to sum to 1, giving P(y|x); their mean over samples is the
    marginal P(y). The score is ``exp`` of the mean, over samples, of
    KL(P(y|x) || P(y)).
    """
    conditional = pred / np.sum(pred, axis=1, keepdims=True)
    marginal = np.mean(conditional, axis=0, keepdims=True)
    # 1e-11 keeps the logarithms finite when a probability is exactly zero.
    log_ratio = np.log(conditional + 1e-11) - np.log(marginal + 1e-11)
    per_sample_kl = np.sum(conditional * log_ratio, axis=1)
    return np.exp(np.mean(per_sample_kl))

metrics/P_C_T.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import numpy as np
+from metrics.precision_recall import knn_precision_recall_features
# Draw two independent standard-normal feature sets as placeholder inputs.
real_features = np.random.normal(0, 1, size=(1600, 512))
generated_features = np.random.normal(0, 1, size=(1600, 512))

# Run the k-NN precision/recall routine for several neighbourhood sizes and
# print whatever it returns.
state = knn_precision_recall_features(
    real_features,
    generated_features,
    nhood_sizes=[1, 2, 3, 4, 5, 10],
    row_batch_size=16,
    col_batch_size=16,
)
print(state)

metrics/get_reference_AST_features.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import json
+import librosa
+import numpy as np
+from tqdm import tqdm
+from metrics.FD import ASTaudio2feature, calculate_statistics, save_AST_feature
+from tools import rms_normalize
+from transformers import AutoProcessor, ASTModel
# Feature extraction in this script runs on the CPU.
device = "cpu"

# Processor and model are loaded from the same pretrained AST checkpoint.
processor = AutoProcessor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
AST = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
AST = AST.to(device)

# NSynth split to process; it selects both the metadata file and the audio folder.
data_split = "train"
with open(f'data/NSynth/{data_split}_examples.json') as f:
    data = json.load(f)
def read_signal(note_str):
    """Load one NSynth note as exactly 64000 samples and RMS-normalise it.

    The file ``data/NSynth/nsynth-{data_split}-52/audio/{note_str}.wav`` is
    loaded at 16 kHz, so 64000 samples correspond to 4 seconds. Longer
    recordings are truncated, shorter ones are zero-padded at the end.

    Returns:
        The result of ``rms_normalize`` applied to the fixed-length signal.
    """
    target_length = 64000
    y, _ = librosa.load(f"data/NSynth/nsynth-{data_split}-52/audio/{note_str}.wav", sr=16000)
    if len(y) >= target_length:
        y = y[:target_length]
    else:
        # Pad with NumPy so both branches hand rms_normalize an ndarray of the
        # same dtype; the earlier code built a 64000-element Python list here.
        y = np.pad(y, (0, target_length - len(y)))
    return rms_normalize(y)
# For every NSynth quality tag: embed all pitch-52 notes carrying that tag and
# store the mean and covariance of their AST features.
for quality in ["bright", "dark", "distortion", "fast_decay", "long_release", "multiphonic", "nonlinear_env", "percussive", "reverb", "tempo-synced"]:
    features = []
    # Iterating data.items() directly lets tqdm see the length and show a
    # real progress bar.
    for note_str, attributes in tqdm(data.items()):
        if attributes["pitch"] != 52:
            continue
        if quality not in attributes['qualities_str']:
            continue
        signal = read_signal(note_str)
        feature_for_one_signal = ASTaudio2feature(device, [signal], processor, AST, sampling_rate=16000)[0]
        features.append(feature_for_one_signal)
    # A covariance needs at least two observations; saving NaN statistics
    # would silently poison later distance computations.
    if len(features) < 2:
        print(f"Skipping {data_split}_{quality}: only {len(features)} matching note(s).")
        continue
    mu, sigma = calculate_statistics(features)
    print(np.shape(mu))
    print(np.shape(sigma))
    save_AST_feature(f'{data_split}_{quality}', mu.tolist(), sigma.tolist())
# For every NSynth instrument family: embed all pitch-52 notes of that family
# and store the mean and covariance of their AST features.
for instrument_name in ["bass", "brass", "flute", "guitar", "keyboard", "mallet", "organ", "reed", "string", "synth_lead", "vocal"]:
    features = []
    # Iterating data.items() directly lets tqdm see the length and show a
    # real progress bar.
    for note_str, attributes in tqdm(data.items()):
        if attributes["pitch"] != 52:
            continue
        if attributes["instrument_family_str"] != instrument_name:
            continue
        signal = read_signal(note_str)
        feature_for_one_signal = ASTaudio2feature(device, [signal], processor, AST, sampling_rate=16000)[0]
        features.append(feature_for_one_signal)
    # A covariance needs at least two observations; saving NaN statistics
    # would silently poison later distance computations.
    if len(features) < 2:
        print(f"Skipping {data_split}_{instrument_name}: only {len(features)} matching note(s).")
        continue
    mu, sigma = calculate_statistics(features)
    print(np.shape(mu))
    print(np.shape(sigma))
    save_AST_feature(f'{data_split}_{instrument_name}', mu.tolist(), sigma.tolist())

metrics/pipelines.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import librosa
+import numpy as np
+import torch
+from tqdm import tqdm
+from tools import VAE_out_put_to_spc, rms_normalize, nnData2Audio
+from model.DiffSynthSampler import DiffSynthSampler
def sample_pipeline(device, uNet, VAE, MMM, CLAP_tokenizer,
                    positive_prompts, negative_prompts, batchsize, sample_steps, CFG, seed=None, duration=3.0,
                    freq_resolution=512, time_resolution=256, channels=4, VAE_scale=4, timesteps=1000, noise_strategy="repeat", sampler="ddim", return_latent=True):
    """Sample a batch of latents from text prompts and optionally decode them.

    The positive and negative prompts are embedded with
    ``MMM.get_text_features``. A ``DiffSynthSampler`` with classifier-free
    guidance, respaced to ``sample_steps`` steps, samples latents of shape
    ``(batchsize, channels, freq_resolution / VAE_scale,
    time_resolution / VAE_scale)``, which are then passed through the VAE
    quantizer.

    Returns:
        The detached quantized latents when ``return_latent`` is true.
        Otherwise a tuple ``(quantized_latents, reconstruction_batch,
        rec_signals)``: the decoder output as a NumPy array and the
        RMS-normalised signals produced by ``nnData2Audio``.
    """
    latent_height = int(freq_resolution/VAE_scale)
    latent_width = int(time_resolution/VAE_scale)
    _, quantizer, decoder = VAE._encoder, VAE._vq_vae, VAE._decoder

    def embed_prompt(prompt):
        # First row of the text features for a single prompt, on the target device.
        tokens = CLAP_tokenizer([prompt], padding=True, return_tensors="pt")
        return MMM.get_text_features(**tokens)[0].to(device)

    positive_embedding = embed_prompt(positive_prompts)
    negative_embedding = embed_prompt(negative_prompts)

    diffusion_sampler = DiffSynthSampler(timesteps, height=latent_height, channels=channels,
                                         noise_strategy=noise_strategy, mute=True)
    diffusion_sampler.activate_classifier_free_guidance(CFG, negative_embedding)
    step_schedule = list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32))
    diffusion_sampler.respace(step_schedule)

    # The same text condition is used for every item of the batch.
    condition = positive_embedding.repeat(batchsize, 1)
    latent_trajectory, _ = diffusion_sampler.sample(
        model=uNet, shape=(batchsize, channels, latent_height, latent_width), seed=seed,
        return_tensor=True, condition=condition, sampler=sampler)
    # Only the final entry of the returned sequence is used.
    final_latents = latent_trajectory[-1]
    quantized, _, (_, _, _) = quantizer(final_latents)

    if return_latent:
        return quantized.detach()

    reconstruction_batch = decoder(quantized).to("cpu").detach().numpy()
    # Scale the time axis with the requested duration before converting to audio.
    time_resolution = int(time_resolution * ((duration+1) / 4))
    raw_signals = nnData2Audio(reconstruction_batch, resolution=(freq_resolution, time_resolution))
    rec_signals = [rms_normalize(raw_signal) for raw_signal in raw_signals]
    return quantized.detach(), reconstruction_batch, rec_signals
def sample_pipeline_GAN(device, gan_generator, VAE, MMM, CLAP_tokenizer,
                    positive_prompts, negative_prompts, batchsize, sample_steps, CFG, seed=None, duration=3.0,
                    freq_resolution=512, time_resolution=256, channels=4, VAE_scale=4, timesteps=1000, noise_strategy="repeat", sampler="ddim", return_latent=True):
    """Generate a batch of latents with a GAN generator and optionally decode them.

    The positive prompt is embedded with ``MMM.get_text_features`` and
    repeated for the batch; ``gan_generator`` maps Gaussian noise of shape
    ``(batchsize, channels, freq_resolution / VAE_scale,
    time_resolution / VAE_scale)`` plus that condition to latents, which are
    then passed through the VAE quantizer. ``negative_prompts``,
    ``sample_steps``, ``CFG``, ``seed``, ``timesteps``, ``noise_strategy``
    and ``sampler`` are accepted but not used by this function.

    Returns:
        The detached quantized latents when ``return_latent`` is true.
        Otherwise a tuple ``(quantized_latents, reconstruction_batch,
        rec_signals)``: the decoder output as a NumPy array and the
        RMS-normalised signals produced by ``nnData2Audio``.
    """
    latent_height = int(freq_resolution/VAE_scale)
    latent_width = int(time_resolution/VAE_scale)
    _, quantizer, decoder = VAE._encoder, VAE._vq_vae, VAE._decoder

    prompt_tokens = CLAP_tokenizer([positive_prompts], padding=True, return_tensors="pt")
    prompt_embedding = MMM.get_text_features(**prompt_tokens)[0].to(device)
    # The same text condition is used for every item of the batch.
    condition = prompt_embedding.repeat(batchsize, 1)

    noise = torch.randn(batchsize, channels, latent_height, latent_width).to(device)
    generated_latents = gan_generator(noise, condition)
    quantized, _, (_, _, _) = quantizer(generated_latents)

    if return_latent:
        return quantized.detach()

    reconstruction_batch = decoder(quantized).to("cpu").detach().numpy()
    # Scale the time axis with the requested duration before converting to audio.
    time_resolution = int(time_resolution * ((duration+1) / 4))
    raw_signals = nnData2Audio(reconstruction_batch, resolution=(freq_resolution, time_resolution))
    rec_signals = [rms_normalize(raw_signal) for raw_signal in raw_signals]
    return quantized.detach(), reconstruction_batch, rec_signals
def inpaint_pipeline(device, uNet, VAE, MMM, CLAP_tokenizer, use_dynamic_mask, noising_strength, guidance,
                    positive_prompts, negative_prompts, batchsize, sample_steps, CFG, seed=None, duration=3.0, mask_flexivity=0.99,
                    freq_resolution=512, time_resolution=256, channels=4, VAE_scale=4, timesteps=1000, noise_strategy="repeat", sampler="ddim", return_latent=True):
    """Inpaint a batch of latents around a guidance latent, conditioned on text.

    Args:
        device: Device for the embeddings, guidance and mask.
        uNet: Denoising model handed to the sampler.
        VAE: Object exposing ``_vq_vae`` and ``_decoder``.
        MMM, CLAP_tokenizer: Text model and tokenizer used to embed prompts.
        use_dynamic_mask, noising_strength, mask_flexivity, sampler:
            Forwarded to ``DiffSynthSampler.inpaint_sample``.
        guidance: Guidance latent; repeated ``batchsize`` times along
            dimension 0 (assumed to be 4-D with a leading size of 1 —
            TODO confirm against callers).
        positive_prompts, negative_prompts: Prompt strings.
        batchsize, sample_steps, CFG: Batch size, number of respaced
            sampling steps and classifier-free guidance scale.
        seed: Accepted for signature parity with ``sample_pipeline``; it is
            not used by this function.
        duration: Sets the latent width as
            ``time_resolution * (duration + 1) / 4 / VAE_scale``.
        return_latent: Selects the return value, see below.

    Returns:
        The detached quantized latents when ``return_latent`` is true.
        Otherwise a tuple ``(quantized_latents, reconstruction_batch,
        rec_signals)``: the decoder output as a NumPy array and the
        RMS-normalised signals produced by ``nnData2Audio``.
    """
    height = int(freq_resolution/VAE_scale)
    width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)
    VAE_quantizer, VAE_decoder = VAE._vq_vae, VAE._decoder

    # Embeddings are moved to `device`, as sample_pipeline does. Without this
    # the condition stays on the text model's device while guidance and mask
    # live on `device`.
    text2sound_embedding = \
        MMM.get_text_features(**CLAP_tokenizer([positive_prompts], padding=True, return_tensors="pt"))[0].to(device)
    negative_condition = \
        MMM.get_text_features(**CLAP_tokenizer([negative_prompts], padding=True, return_tensors="pt"))[0].to(device)

    mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy, mute=True)
    mySampler.activate_classifier_free_guidance(CFG, negative_condition)
    mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))

    condition = text2sound_embedding.repeat(batchsize, 1)
    guidance = guidance.repeat(batchsize, 1, 1, 1).to(device)

    # mask = 1 means "freeze": the last quarter-of-time_resolution columns
    # (in latent units) are kept fixed.
    latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)
    latent_mask[:, :, :, -int(time_resolution * (1 / 4) / VAE_scale):] = 1.0

    latent_representations, _ = \
        mySampler.inpaint_sample(model=uNet, shape=(batchsize, channels, height, width),
                                  noising_strength=noising_strength,
                                  guide_img=guidance, mask=latent_mask, return_tensor=True,
                                  condition=condition, sampler=sampler,
                                  use_dynamic_mask=use_dynamic_mask,
                                  end_noise_level_ratio=0.0,
                                  mask_flexivity=mask_flexivity)
    # Only the final entry of the returned sequence is used.
    latent_representations = latent_representations[-1]
    quantized_latent_representations, _, (_, _, _) = VAE_quantizer(latent_representations)

    if return_latent:
        return quantized_latent_representations.detach()

    reconstruction_batch = VAE_decoder(quantized_latent_representations).to("cpu").detach().numpy()
    # Scale the time axis with the requested duration before converting to audio.
    time_resolution = int(time_resolution * ((duration+1) / 4))
    rec_signals = nnData2Audio(reconstruction_batch, resolution=(freq_resolution, time_resolution))
    rec_signals = [rms_normalize(rec_signal) for rec_signal in rec_signals]
    return quantized_latent_representations.detach(), reconstruction_batch, rec_signals
def generate_audios_with_diffuSynth_sample(device, uNet, VAE, MMM, CLAP_tokenizer, num_batches, positive_prompts, negative_prompts="", CFG=6, sample_steps=10):
    """Call ``sample_pipeline`` ``num_batches`` times and collect every returned waveform.

    Each call requests a batch of 16 samples with ``seed=None`` and
    ``return_latent=False``; only the third element of the pipeline's result
    (the list of signals) is kept.

    Returns:
        ``np.array`` built from all collected signals, in generation order.
    """
    pipeline_options = dict(
        positive_prompts=positive_prompts,
        negative_prompts=negative_prompts,
        batchsize=16,
        sample_steps=sample_steps,
        CFG=CFG,
        seed=None,
        return_latent=False,
    )
    collected = []
    for _ in tqdm(range(num_batches)):
        _latents, _reconstructions, batch_signals = sample_pipeline(
            device, uNet, VAE, MMM, CLAP_tokenizer, **pipeline_options)
        collected += batch_signals
    return np.array(collected)
def generate_audios_with_diffuSynth_inpaint(device, uNet, VAE, MMM, CLAP_tokenizer, num_batches, guidance, duration, use_dynamic_mask, noising_strength, positive_prompts, negative_prompts="", CFG=6, sample_steps=10):
    """Call ``inpaint_pipeline`` ``num_batches`` times and collect every returned waveform.

    Each call requests a batch of 16 samples with ``seed=None``,
    ``mask_flexivity=0.999`` and ``return_latent=False``; only the third element
    of the pipeline's result (the list of signals) is kept.

    Returns:
        ``np.array`` built from all collected signals, in generation order.
    """
    pipeline_options = dict(
        use_dynamic_mask=use_dynamic_mask,
        noising_strength=noising_strength,
        guidance=guidance,
        positive_prompts=positive_prompts,
        negative_prompts=negative_prompts,
        batchsize=16,
        sample_steps=sample_steps,
        CFG=CFG,
        seed=None,
        duration=duration,
        mask_flexivity=0.999,
        return_latent=False,
    )
    collected = []
    for _ in tqdm(range(num_batches)):
        _latents, _reconstructions, batch_signals = inpaint_pipeline(
            device, uNet, VAE, MMM, CLAP_tokenizer, **pipeline_options)
        collected += batch_signals
    return np.array(collected)

metrics/pipelines_STFT.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import librosa
+import numpy as np
+import torch
+from tqdm import tqdm
+from tools import rms_normalize, decode_stft, depad_STFT
+from model.DiffSynthSampler import DiffSynthSampler
def sample_pipeline_STFT(device, uNet, VAE, MMM, CLAP_tokenizer,
                    positive_prompts, negative_prompts, batchsize, sample_steps, CFG, seed=None,
                    freq_resolution=512, time_resolution=256, channels=4, VAE_scale=4, timesteps=1000, noise_strategy="repeat", sampler="ddim", return_latent=True):
    """Sample a fix-length audio using a diffusion model, including 'ISTFT+' post-processing.

    Args:
        device: Torch device the text embeddings are moved to.
        uNet: Denoising model handed to the sampler.
        VAE: Object exposing ``_vq_vae`` (quantizer) and ``_decoder``.
        MMM: Text encoder exposing ``get_text_features``.
        CLAP_tokenizer: Tokenizer whose output is unpacked into ``get_text_features``.
        positive_prompts: Text prompt used as the condition.
        negative_prompts: Text prompt used for classifier-free guidance.
        batchsize: Number of samples generated in one call.
        sample_steps: Number of respaced diffusion steps.
        CFG: Classifier-free guidance value passed to the sampler.
        seed: Forwarded to ``DiffSynthSampler.sample``.
        freq_resolution, time_resolution, channels, VAE_scale: Determine the latent
            shape ``(batchsize, channels, freq_resolution / VAE_scale,
            time_resolution / VAE_scale)``.
        timesteps, noise_strategy, sampler: Sampler configuration.
        return_latent: If true, return only the quantized latents.

    Returns:
        The detached quantized latents when ``return_latent`` is true; otherwise a
        tuple ``(quantized_latents, reconstruction_batch, rec_signals)`` where
        ``reconstruction_batch`` is the decoder output as a numpy array and
        ``rec_signals`` is a list of RMS-normalized waveforms obtained through
        ``decode_stft`` -> ``depad_STFT`` -> ``librosa.istft``.
    """
    height = int(freq_resolution / VAE_scale)
    width = int(time_resolution / VAE_scale)
    VAE_quantizer, VAE_decoder = VAE._vq_vae, VAE._decoder

    text2sound_embedding = \
        MMM.get_text_features(**CLAP_tokenizer([positive_prompts], padding=True, return_tensors="pt"))[0].to(device)
    negative_condition = \
        MMM.get_text_features(**CLAP_tokenizer([negative_prompts], padding=True, return_tensors="pt"))[0].to(device)

    mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy, mute=True)
    mySampler.activate_classifier_free_guidance(CFG, negative_condition)
    mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))

    condition = text2sound_embedding.repeat(batchsize, 1)
    # The second return value (the initial noise) is not needed here.
    latent_representations, _ = \
        mySampler.sample(model=uNet, shape=(batchsize, channels, height, width), seed=seed,
                         return_tensor=True, condition=condition, sampler=sampler)
    # Only the last entry of the returned sequence is used.
    latent_representations = latent_representations[-1]
    quantized_latent_representations, _, (_, _, _) = VAE_quantizer(latent_representations)
    if return_latent:
        return quantized_latent_representations.detach()

    reconstruction_batch = VAE_decoder(quantized_latent_representations).to("cpu").detach().numpy()
    rec_signals = []
    for STFT in reconstruction_batch:
        padded_D_rec = decode_stft(STFT)
        D_rec = depad_STFT(padded_D_rec)
        # Invert the STFT back to a waveform, then normalize its RMS level.
        rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
        rec_signals.append(rms_normalize(rec_signal))
    return quantized_latent_representations.detach(), reconstruction_batch, rec_signals
def sample_pipeline_GAN_STFT(device, gan_generator, VAE, MMM, CLAP_tokenizer,
                    positive_prompts, negative_prompts, batchsize, sample_steps, CFG, seed=None,
                    freq_resolution=512, time_resolution=256, channels=4, VAE_scale=4, timesteps=1000, noise_strategy="repeat", sampler="ddim", return_latent=True):
    """Sample fix-length audio using a GAN, including 'ISTFT+' post-processing.

    The signature mirrors ``sample_pipeline_STFT``. ``negative_prompts``,
    ``sample_steps``, ``CFG``, ``timesteps``, ``noise_strategy`` and ``sampler``
    are accepted but not used by this function.

    Args:
        device: Torch device the text embedding and the noise are moved to.
        gan_generator: Callable invoked as ``gan_generator(noise, condition)``.
        VAE: Object exposing ``_vq_vae`` (quantizer) and ``_decoder``.
        MMM: Text encoder exposing ``get_text_features``.
        CLAP_tokenizer: Tokenizer whose output is unpacked into ``get_text_features``.
        positive_prompts: Text prompt used as the condition.
        batchsize: Number of samples generated in one call.
        seed: If not ``None``, seeds a private ``torch.Generator`` so the input
            noise (and therefore the output for a deterministic generator) is
            reproducible. ``None`` draws from the global RNG as before.
        freq_resolution, time_resolution, channels, VAE_scale: Determine the noise
            shape ``(batchsize, channels, freq_resolution / VAE_scale,
            time_resolution / VAE_scale)``.
        return_latent: If true, return only the quantized latents.

    Returns:
        The detached quantized latents when ``return_latent`` is true; otherwise a
        tuple ``(quantized_latents, reconstruction_batch, rec_signals)`` where
        ``reconstruction_batch`` is the decoder output as a numpy array and
        ``rec_signals`` is a list of RMS-normalized waveforms.
    """
    height = int(freq_resolution / VAE_scale)
    width = int(time_resolution / VAE_scale)
    VAE_quantizer, VAE_decoder = VAE._vq_vae, VAE._decoder

    text2sound_embedding = \
        MMM.get_text_features(**CLAP_tokenizer([positive_prompts], padding=True, return_tensors="pt"))[0].to(device)
    condition = text2sound_embedding.repeat(batchsize, 1)

    # Honour `seed` (previously ignored) without touching the global RNG state.
    if seed is None:
        noise = torch.randn(batchsize, channels, height, width)
    else:
        generator = torch.Generator().manual_seed(int(seed))
        noise = torch.randn(batchsize, channels, height, width, generator=generator)
    noise = noise.to(device)

    latent_representations = gan_generator(noise, condition)
    quantized_latent_representations, _, (_, _, _) = VAE_quantizer(latent_representations)
    if return_latent:
        return quantized_latent_representations.detach()

    reconstruction_batch = VAE_decoder(quantized_latent_representations).to("cpu").detach().numpy()
    rec_signals = []
    for STFT in reconstruction_batch:
        padded_D_rec = decode_stft(STFT)
        D_rec = depad_STFT(padded_D_rec)
        # Invert the STFT back to a waveform, then normalize its RMS level.
        rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
        rec_signals.append(rms_normalize(rec_signal))
    return quantized_latent_representations.detach(), reconstruction_batch, rec_signals
def generate_audios_with_diffuSynth_sample(device, uNet, VAE, MMM, CLAP_tokenizer, num_batches, positive_prompts, negative_prompts="", CFG=6, sample_steps=10):
    """Call ``sample_pipeline_STFT`` ``num_batches`` times and collect every returned waveform.

    Each call requests a batch of 8 samples with ``seed=None`` and
    ``return_latent=False``; only the third element of the pipeline's result
    (the list of ISTFT-reconstructed signals) is kept.

    Returns:
        ``np.array`` built from all collected signals, in generation order.
    """
    pipeline_options = dict(
        positive_prompts=positive_prompts,
        negative_prompts=negative_prompts,
        batchsize=8,
        sample_steps=sample_steps,
        CFG=CFG,
        seed=None,
        return_latent=False,
    )
    collected = []
    for _ in tqdm(range(num_batches)):
        _latents, _reconstructions, batch_signals = sample_pipeline_STFT(
            device, uNet, VAE, MMM, CLAP_tokenizer, **pipeline_options)
        collected += batch_signals
    return np.array(collected)

metrics/precision_recall.py ADDED Viewed

	@@ -0,0 +1,204 @@

+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# This work is licensed under the Creative Commons Attribution-NonCommercial
+# 4.0 International License. To view a copy of this license, visit
+# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
+# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+"""k-NN precision and recall."""
+from time import time
+# ----------------------------------------------------------------------------
+import numpy as np
+from tqdm import tqdm
def batch_pairwise_distances(U, V):
    """Return squared Euclidean distances between every row of U and every row of V.

    Uses the expansion ``|u|^2 - 2 u.v + |v|^2`` and clamps the result at zero so
    rounding error cannot produce negative distances. The result has one row per
    row of ``U`` and one column per row of ``V``.
    """
    sq_norm_rows = np.square(U).sum(axis=1).reshape(-1, 1)
    sq_norm_cols = np.square(V).sum(axis=1).reshape(1, -1)
    cross_term = U @ V.T
    return np.maximum(sq_norm_rows - 2 * cross_term + sq_norm_cols, 0.0)
+# ----------------------------------------------------------------------------
class DistanceBlock():
    """Object wrapper that exposes ``batch_pairwise_distances`` as a method."""

    def __init__(self, num_features):
        # Kept as an attribute only; pairwise_distances does not read it.
        self.num_features = num_features

    def pairwise_distances(self, U, V):
        """Return the pairwise distance matrix between the rows of U and the rows of V."""
        distances = batch_pairwise_distances(U, V)
        return distances
+# ----------------------------------------------------------------------------
class ManifoldEstimator():
    """Estimates the manifold of given feature vectors.

    For every reference feature vector the distance to its k-th nearest
    neighbour (for each k in ``nhood_sizes``) is stored in ``self.D``; a new
    sample counts as "on the manifold" when it lies within that radius of at
    least one reference sample. Distances are whatever
    ``distance_block.pairwise_distances`` returns.
    """

    def __init__(self, distance_block, features, row_batch_size=16, col_batch_size=16,
                 nhood_sizes=None, clamp_to_percentile=None, eps=1e-5, mute=False):
        """Estimate the manifold of given feature vectors.

            Args:
                distance_block: Object with a ``pairwise_distances(U, V)`` method.
                features (np.array): Matrix of feature vectors (one row per sample)
                    whose manifold is estimated.
                row_batch_size (int): Row batch size to compute pairwise distances
                    (parameter to trade-off between memory usage and performance).
                col_batch_size (int): Column batch size to compute pairwise distances.
                nhood_sizes (list): Number of neighbors used to estimate the manifold.
                    ``None`` means ``[3]``. The largest value must be smaller than the
                    number of samples, otherwise ``np.partition`` raises.
                clamp_to_percentile (float): Prune hyperspheres that have radius larger than
                    the given percentile (their radius is set to 0).
                eps (float): Small number for numerical stability.
                mute (bool): If true, do not show a progress bar.
        """
        # Build a fresh list per instance: a literal ``[3]`` default would be one
        # list object shared (and mutable) across every estimator.
        nhood_sizes = [3] if nhood_sizes is None else list(nhood_sizes)

        num_images = features.shape[0]
        self.nhood_sizes = nhood_sizes
        self.num_nhoods = len(nhood_sizes)
        self.eps = eps
        self.row_batch_size = row_batch_size
        self.col_batch_size = col_batch_size
        self._ref_features = features
        self._distance_block = distance_block
        self.mute = mute

        # Estimate manifold of features by calculating distances to k-NN of each sample.
        self.D = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
        distance_batch = np.zeros([row_batch_size, num_images], dtype=np.float32)
        # Partitioning on 0..max(k) puts the k-th smallest distance at column k
        # (column 0 is the sample's distance to itself).
        seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)

        row_starts = range(0, num_images, row_batch_size)
        if not mute:
            row_starts = tqdm(row_starts)
        for begin1 in row_starts:
            end1 = min(begin1 + row_batch_size, num_images)
            num_rows = end1 - begin1
            self._fill_distance_rows(features[begin1:end1], num_rows, features, distance_batch)
            # Find the k-nearest neighbor from the current batch.
            self.D[begin1:end1, :] = np.partition(distance_batch[0:num_rows, :], seq, axis=1)[:, self.nhood_sizes]

        if clamp_to_percentile is not None:
            max_distances = np.percentile(self.D, clamp_to_percentile, axis=0)
            self.D[self.D > max_distances] = 0

    def _fill_distance_rows(self, row_batch, num_rows, ref_features, out):
        """Write distances from ``row_batch`` to all of ``ref_features`` into ``out[:num_rows]``."""
        num_refs = out.shape[1]
        for begin2 in range(0, num_refs, self.col_batch_size):
            end2 = min(begin2 + self.col_batch_size, num_refs)
            out[0:num_rows, begin2:end2] = self._distance_block.pairwise_distances(
                row_batch, ref_features[begin2:end2])

    def evaluate(self, eval_features, return_realism=False, return_neighbors=False):
        """Evaluate if new feature vectors are at the manifold.

            Args:
                eval_features (np.array): Feature vectors to test, one row per sample.
                return_realism (bool): Also return, per sample, the maximum over
                    reference samples of ``D[:, 0] / (distance + eps)``.
                return_neighbors (bool): Also return, per sample, the index of the
                    closest reference sample.
            Returns:
                ``batch_predictions`` (int32 array, one row per sample and one column
                per neighborhood size; 1 if the sample is inside any hypersphere),
                followed by the realism scores and/or nearest indices when requested,
                in that order.
        """
        num_eval_images = eval_features.shape[0]
        num_ref_images = self.D.shape[0]
        distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
        batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
        max_realism_score = np.zeros([num_eval_images, ], dtype=np.float32)
        nearest_indices = np.zeros([num_eval_images, ], dtype=np.int32)

        for begin1 in range(0, num_eval_images, self.row_batch_size):
            end1 = min(begin1 + self.row_batch_size, num_eval_images)
            num_rows = end1 - begin1
            self._fill_distance_rows(eval_features[begin1:end1], num_rows, self._ref_features, distance_batch)
            current = distance_batch[0:num_rows, :]

            # From the minibatch of new feature vectors, determine if they are in the estimated manifold.
            # If a feature vector is inside a hypersphere of some reference sample, then
            # the new sample lies at the estimated manifold.
            # The radii of the hyperspheres are determined from distances of neighborhood size k.
            samples_in_manifold = current[:, :, None] <= self.D
            batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)
            max_realism_score[begin1:end1] = np.max(self.D[:, 0] / (current + self.eps), axis=1)
            nearest_indices[begin1:end1] = np.argmin(current, axis=1)

        if return_realism and return_neighbors:
            return batch_predictions, max_realism_score, nearest_indices
        elif return_realism:
            return batch_predictions, max_realism_score
        elif return_neighbors:
            return batch_predictions, nearest_indices
        return batch_predictions
+# ----------------------------------------------------------------------------
def knn_precision_recall_features(ref_features, eval_features, nhood_sizes=None,
                                  row_batch_size=10000, col_batch_size=50000, mute=False):
    """Calculates k-NN precision and recall for two sets of feature vectors.

        Args:
            ref_features (np.array): Feature vectors of reference samples, one row each.
            eval_features (np.array): Feature vectors of generated samples, one row each.
            nhood_sizes (list): Number of neighbors used to estimate the manifold.
                ``None`` means ``[3]``.
            row_batch_size (int): Row batch size to compute pairwise distances
                (parameter to trade-off between memory usage and performance).
            col_batch_size (int): Column batch size to compute pairwise distances.
            mute (bool): If true, suppress progress bars and timing messages.
        Returns:
            State (dict): Dict with keys ``'precision'`` and ``'recall'``, each the
            mean (over samples) of the in-manifold predictions, one value per
            neighborhood size.
    """
    # Resolve the default here so no mutable list lives in the signature and the
    # estimators below always receive a concrete list.
    nhood_sizes = [3] if nhood_sizes is None else list(nhood_sizes)

    state = dict()
    num_images = ref_features.shape[0]
    num_features = ref_features.shape[1]

    # Initialize DistanceBlock and ManifoldEstimators.
    distance_block = DistanceBlock(num_features)
    ref_manifold = ManifoldEstimator(distance_block, ref_features, row_batch_size, col_batch_size, nhood_sizes, mute=mute)
    eval_manifold = ManifoldEstimator(distance_block, eval_features, row_batch_size, col_batch_size, nhood_sizes, mute=mute)

    # Evaluate precision and recall using k-nearest neighbors.
    if not mute:
        print('Evaluating k-NN precision and recall with %i samples...' % num_images)
    start = time()

    # Precision: How many points from eval_features are in ref_features manifold.
    precision = ref_manifold.evaluate(eval_features)
    state['precision'] = precision.mean(axis=0)

    # Recall: How many points from ref_features are in eval_features manifold.
    recall = eval_manifold.evaluate(ref_features)
    state['recall'] = recall.mean(axis=0)

    if not mute:
        print('Evaluated k-NN precision and recall in: %gs' % (time() - start))
    return state
+# ----------------------------------------------------------------------------

metrics/visualizations.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import numpy as np
+from matplotlib import pyplot as plt
+from scipy.fft import fft
+from scipy.signal import savgol_filter
+from tools import rms_normalize
# RGB triples (0-255) cycled through by the plotting helpers below.
# Entries that are currently disabled: black (0, 0, 0), sky blue (86, 180, 233),
# yellow (240, 228, 66), reddish purple (204, 121, 167).
_VERMILION = (213, 94, 0)
_BLUE = (0, 114, 178)
_ORANGE = (230, 159, 0)
_BLUISH_GREEN = (0, 158, 115)
colors = [_VERMILION, _BLUE, _ORANGE, _BLUISH_GREEN]
def plot_psd_multiple_signals(signals_list, labels_list, sample_rate=16000, window_size=500,
                              figsize=(10, 6), save_path=None, normalize=False):
    """
    Plot the smoothed, log2-scaled mean power spectrum of several groups of audio
    signals on a single figure.

    Parameters:
    signals_list: List of signal groups; each group is indexed as
        [sample_number, sample_length] after conversion to a numpy array.
    labels_list: One legend label per group.
    sample_rate: Sampling rate of the audio, used to build the frequency axis.
    window_size: Window length passed to the Savitzky-Golay filter (polynomial order 3).
    figsize: Figure size.
    save_path: Path to save the figure to; if falsy, the figure is shown instead.
    normalize: If true, divide each smoothed curve by its own mean.
    """
    # Every group of signals needs exactly one label.
    assert len(signals_list) == len(labels_list), "每组信号必须有一个对应的标签。"
    normalized_groups = [np.array([rms_normalize(clip) for clip in group]) for group in signals_list]

    plt.figure(figsize=figsize)

    for group_index, (group, label) in enumerate(zip(normalized_groups, labels_list)):
        num_samples = group.shape[1]
        half = num_samples // 2
        # FFT of every clip, then the power averaged over the clips of the group.
        spectrum = fft(group, axis=1)
        mean_power = np.mean(np.abs(spectrum) ** 2, axis=0)
        freq_axis = np.fft.fftfreq(num_samples, 1 / sample_rate)
        # Keep only the first half of the bins, go to log2 scale, then smooth.
        log_power = np.log2(mean_power[:half] + 1)
        curve = savgol_filter(log_power, window_size, 3)
        # Normalize each curve if normalize is True
        if normalize:
            curve /= np.mean(curve)
        rgb = [channel / 255.0 for channel in colors[group_index % len(colors)]]
        plt.plot(freq_axis[:half], curve, label=label, color=rgb, linewidth=1)

    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Mean Log-Amplitude')
    plt.legend()

    # Save or show the figure based on save_path parameter
    if save_path:
        plt.savefig(save_path)
    else:
        plt.show()
def plot_amplitude_over_time(signals_list, labels_list, sample_rate=16000, window_size=500,
                             figsize=(10, 6), save_path=None, normalize=False, start_time=0):
    """
    Plot the smoothed, log2-scaled mean absolute amplitude over time for several
    groups of audio signals on a single figure.

    Parameters:
    signals_list: List of signal groups; each group is indexed as
        [sample_number, sample_length] after conversion to a numpy array.
    labels_list: One legend label per group.
    sample_rate: Sampling rate of the audio, used to build the time axis.
    window_size: Window length passed to the Savitzky-Golay filter (polynomial order 3).
    figsize: Figure size.
    save_path: Path to save the figure to; if falsy, the figure is shown instead.
    normalize: If true, divide each smoothed curve by its own mean.
    start_time: Time in seconds at which plotting starts; earlier samples are dropped.
        The time axis length is taken from the first group.
    """
    assert len(signals_list) == len(labels_list), f"len(signals_list) != len(labels_list) for " \
                                                  f"len(signals_list) = {len(signals_list)} and len(labels_list) = {len(labels_list)}"

    # First sample index that is kept.
    first_sample = int(start_time * sample_rate)

    # RMS-normalize every clip, then cut away everything before start_time.
    trimmed_groups = [np.array([rms_normalize(clip)[first_sample:] for clip in group]) for group in signals_list]
    seconds = np.arange(first_sample, first_sample + trimmed_groups[0].shape[1]) / sample_rate

    plt.figure(figsize=figsize)

    for group_index, (group, label) in enumerate(zip(trimmed_groups, labels_list)):
        mean_abs = np.mean(np.abs(group), axis=0)
        curve = savgol_filter(np.log2(mean_abs + 1), window_size, 3)
        # Normalize each curve if normalize is True
        if normalize:
            curve /= np.mean(curve)
        rgb = [channel / 255.0 for channel in colors[group_index % len(colors)]]
        plt.plot(seconds, curve, label=label, color=rgb, linewidth=1)

    plt.xlabel('Time (seconds)')
    plt.ylabel('Mean Log-Amplitude')
    plt.legend()

    # Save or show the figure based on save_path parameter
    if save_path:
        plt.savefig(save_path)
    else:
        plt.show()