Spaces:
Running
Running
Refactor SVS evaluation and move SingMOS functions to svs_eval.py; add pitch interval and chroma entropy
Browse files
- server.py +3 -4
- svs_eval.py +113 -0
- svs_utils.py +0 -17
server.py
CHANGED
|
@@ -3,9 +3,7 @@ from fastapi.responses import FileResponse, JSONResponse
|
|
| 3 |
import base64
|
| 4 |
import argparse
|
| 5 |
import librosa
|
| 6 |
-
import torch
|
| 7 |
import tempfile
|
| 8 |
-
import os
|
| 9 |
from transformers import pipeline
|
| 10 |
import re
|
| 11 |
from svs_utils import svs_warmup, svs_inference
|
|
@@ -14,7 +12,8 @@ import soundfile as sf
|
|
| 14 |
from pypinyin import lazy_pinyin
|
| 15 |
import jiwer
|
| 16 |
import librosa
|
| 17 |
-
from svs_utils import
|
|
|
|
| 18 |
|
| 19 |
app = FastAPI()
|
| 20 |
|
|
@@ -49,7 +48,7 @@ config = argparse.Namespace(
|
|
| 49 |
|
| 50 |
# load model
|
| 51 |
svs_model = svs_warmup(config)
|
| 52 |
-
predictor
|
| 53 |
sample_rate = 44100
|
| 54 |
|
| 55 |
# load dataset for random_select
|
|
|
|
| 3 |
import base64
|
| 4 |
import argparse
|
| 5 |
import librosa
|
|
|
|
| 6 |
import tempfile
|
|
|
|
| 7 |
from transformers import pipeline
|
| 8 |
import re
|
| 9 |
from svs_utils import svs_warmup, svs_inference
|
|
|
|
| 12 |
from pypinyin import lazy_pinyin
|
| 13 |
import jiwer
|
| 14 |
import librosa
|
| 15 |
+
from svs_utils import load_song_database, estimate_sentence_length
|
| 16 |
+
from svs_eval import singmos_warmup, singmos_evaluation
|
| 17 |
|
| 18 |
app = FastAPI()
|
| 19 |
|
|
|
|
| 48 |
|
| 49 |
# load model
|
| 50 |
svs_model = svs_warmup(config)
|
| 51 |
+
predictor = singmos_warmup()
|
| 52 |
sample_rate = 44100
|
| 53 |
|
| 54 |
# load dataset for random_select
|
svs_eval.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import pyworld as pw
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def singmos_warmup():
    """Load the SingMOS predictor through ``torch.hub``.

    Returns:
        The object ``torch.hub.load`` returns for the ``singing_ssl_mos``
        entry point of the ``South-Twilight/SingMOS:v0.2.0`` repository.
    """
    repo = "South-Twilight/SingMOS:v0.2.0"
    entry_point = "singing_ssl_mos"
    # trust_repo=True is passed so the hub repository is loaded as trusted.
    return torch.hub.load(repo, entry_point, trust_repo=True)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def singmos_evaluation(predictor, wav_info, fs):
    """Score a waveform with a SingMOS predictor.

    Args:
        predictor: Callable invoked as ``predictor(waveform, lengths)``,
            e.g. the object returned by ``singmos_warmup``.
        wav_info: NumPy array of audio samples at rate ``fs``
            (assumed 1-D mono -- TODO confirm against callers).
        fs: Sampling rate of ``wav_info`` in Hz.

    Returns:
        Whatever ``predictor`` returns for the resampled waveform.
    """
    # The waveform is resampled to 16 kHz before being handed to the predictor.
    resampled = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    # Add a leading batch dimension and pass the sample count as the length.
    batch = torch.from_numpy(resampled).unsqueeze(0)
    lengths = torch.tensor([batch.shape[1]])
    return predictor(batch, lengths)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def pitch_interval_evaluation(y, fs):
    """Compute frame-to-frame pitch interval statistics in semitones.

    F0 is estimated with WORLD (``pw.dio`` refined by ``pw.stonemask``),
    converted to MIDI note numbers, and differenced between consecutive
    frames. Differences that involve a frame with zero F0 are discarded.

    Args:
        y: NumPy array of audio samples (converted to float64 for pyworld).
        fs: Sampling rate of ``y`` in Hz.

    Returns:
        Tuple ``(interval_mean, interval_std)``: the mean of the absolute
        intervals and the standard deviation of the signed intervals.
        ``(nan, nan)`` is returned when fewer than two frames exist or when
        no valid interval remains after discarding NaNs.
    """
    # Convert once; both pyworld calls need the same float64 signal.
    signal = y.astype(np.float64)
    coarse_f0, frame_times = pw.dio(signal, fs)
    f0 = pw.stonemask(signal, coarse_f0, frame_times, fs)

    if len(f0) < 2:
        return np.nan, np.nan

    # Zero F0 frames become NaN so they drop out of the interval statistics
    # instead of producing huge artificial jumps.
    f0[f0 == 0] = np.nan
    midi_f0 = librosa.hz_to_midi(f0)

    # only consider the intervals between notes
    intervals = np.diff(midi_f0)
    intervals = intervals[~np.isnan(intervals)]

    # Without this guard np.mean/np.std on an empty array emit
    # RuntimeWarnings before yielding NaN anyway.
    if intervals.size == 0:
        return np.nan, np.nan

    interval_mean = np.mean(np.abs(intervals))
    interval_std = np.std(intervals)
    return interval_mean, interval_std
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def chroma_entropy_evaluation(y, fs):
    """Return the mean per-frame Shannon entropy (in bits) of the chroma.

    The chromagram from ``librosa.feature.chroma_cqt`` is normalised along
    axis 0 so each column sums to (approximately) one, and the entropy
    ``-sum(p * log2(p))`` is computed per column and averaged.

    Args:
        y: Audio samples passed to ``librosa.feature.chroma_cqt``.
        fs: Sampling rate of ``y`` in Hz.

    Returns:
        Mean entropy over all columns; non-negative by construction.
    """
    chroma = librosa.feature.chroma_cqt(y=y, sr=fs)

    # Clip the column sums so all-zero columns do not divide by zero.
    column_totals = np.clip(np.sum(chroma, axis=0, keepdims=True), 1e-6, None)
    chroma_norm = chroma / column_totals

    # Clip the probabilities so log2 never sees zero.
    chroma_norm = np.clip(chroma_norm, 1e-6, 1.0)

    # Shannon entropy carries a leading minus sign; without it the value is
    # the negative entropy (always <= 0).
    entropy = -np.sum(chroma_norm * np.log2(chroma_norm), axis=0)
    return np.mean(entropy)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
if __name__ == "__main__":
    # Command-line entry point: evaluate one wav file and append the metrics
    # as a row of a results CSV (a header row is written to an empty file).
    import argparse
    import csv
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wav_path",
        type=Path,
        required=True,  # dereferenced unconditionally below
        help="Path to the wav file",
    )
    parser.add_argument(
        "--results_csv",
        type=Path,
        required=True,  # dereferenced unconditionally below
        help="csv file to save the results",
    )
    args = parser.parse_args()

    args.results_csv.parent.mkdir(parents=True, exist_ok=True)

    y, fs = librosa.load(args.wav_path, sr=None)

    # warmup
    predictor = singmos_warmup()

    # singmos evaluation
    score = singmos_evaluation(predictor, y, fs)
    # str() of a tensor yields "tensor([...])" (possibly with commas), which
    # corrupts the CSV row; store the plain number when there is exactly one.
    if isinstance(score, torch.Tensor) and score.numel() == 1:
        score = score.item()

    # pitch interval evaluation
    interval_mean, interval_std = pitch_interval_evaluation(y, fs)
    # chroma entropy evaluation
    chroma_entropy = chroma_entropy_evaluation(y, fs)

    # # visualize
    # import matplotlib.pyplot as plt
    # import librosa.display
    # chroma = librosa.feature.chroma_cqt(y=y, sr=fs)
    # img = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
    # plt.colorbar(img)
    # plt.savefig(args.results_csv.parent / args.wav_path.with_suffix('.png'))

    # save results
    results = {
        "singmos": score,
        "pitch_interval_mean": interval_mean,
        "pitch_interval_std": interval_std,
        "chroma_entropy": chroma_entropy,
    }

    columns = ["file", *results]
    header = ",".join(columns) + "\n"

    # An existing, non-empty CSV must already carry the same header so that
    # appended rows line up with its columns.
    csv_has_rows = (
        args.results_csv.exists() and args.results_csv.stat().st_size > 0
    )
    if csv_has_rows:
        with open(args.results_csv, "r") as existing:
            file_header = existing.readline()
        if file_header != header:
            raise ValueError(f"Header mismatch: {file_header} vs {header}")

    # csv.writer quotes fields containing commas or quotes (e.g. unusual
    # file paths); "\n" keeps the line ending used by earlier rows.
    with open(args.results_csv, "a", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        if not csv_has_rows:
            writer.writerow(columns)
        writer.writerow(
            [str(args.wav_path)] + [str(v) for v in results.values()]
        )
|
svs_utils.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
import random
|
| 3 |
|
| 4 |
-
import librosa
|
| 5 |
import numpy as np
|
| 6 |
-
import torch
|
| 7 |
from espnet2.bin.svs_inference import SingingGenerate
|
| 8 |
from espnet_model_zoo.downloader import ModelDownloader
|
| 9 |
|
|
@@ -227,21 +225,6 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
|
|
| 227 |
return wav_info
|
| 228 |
|
| 229 |
|
| 230 |
-
def singmos_warmup():
|
| 231 |
-
predictor = torch.hub.load(
|
| 232 |
-
"South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
|
| 233 |
-
)
|
| 234 |
-
return predictor, "South-Twilight/SingMOS:v0.2.0"
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
def singmos_evaluation(predictor, wav_info, fs):
|
| 238 |
-
wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
|
| 239 |
-
wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
|
| 240 |
-
len_mos = torch.tensor([wav_mos.shape[1]])
|
| 241 |
-
score = predictor(wav_mos, len_mos)
|
| 242 |
-
return score
|
| 243 |
-
|
| 244 |
-
|
| 245 |
def estimate_sentence_length(query, config, song2note_lengths):
|
| 246 |
if config.melody_source == "random_select.touhou":
|
| 247 |
song_name = "touhou"
|
|
|
|
| 1 |
import json
|
| 2 |
import random
|
| 3 |
|
|
|
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
from espnet2.bin.svs_inference import SingingGenerate
|
| 6 |
from espnet_model_zoo.downloader import ModelDownloader
|
| 7 |
|
|
|
|
| 225 |
return wav_info
|
| 226 |
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
def estimate_sentence_length(query, config, song2note_lengths):
|
| 229 |
if config.melody_source == "random_select.touhou":
|
| 230 |
song_name = "touhou"
|