Spaces:
Sleeping
Sleeping
avoice-dev committed on
Commit ·
7926ab4
1
Parent(s): 58995dc
fix(common): deploy timbral models analyzer
Browse files- app.py +44 -0
- requirements.txt +7 -0
- timbral_models/Timbral_Booming.py +156 -0
- timbral_models/Timbral_Brightness.py +186 -0
- timbral_models/Timbral_Depth.py +289 -0
- timbral_models/Timbral_Extractor.py +140 -0
- timbral_models/Timbral_Hardness.py +274 -0
- timbral_models/Timbral_Reverb.py +370 -0
- timbral_models/Timbral_Roughness.py +185 -0
- timbral_models/Timbral_Sharpness.py +120 -0
- timbral_models/Timbral_Warmth.py +263 -0
- timbral_models/__init__.py +12 -0
- timbral_models/__pycache__/Timbral_Booming.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Brightness.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Depth.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Extractor.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Hardness.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Reverb.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Roughness.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Sharpness.cpython-310.pyc +0 -0
- timbral_models/__pycache__/Timbral_Warmth.cpython-310.pyc +0 -0
- timbral_models/__pycache__/__init__.cpython-310.pyc +0 -0
- timbral_models/__pycache__/timbral_util.cpython-310.pyc +0 -0
- timbral_models/timbral_util.py +1816 -0
app.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from timbral_models.Timbral_Extractor import timbral_extractor
|
| 3 |
+
|
| 4 |
+
def main_timbre_iosr_analysis(in_files):
    """Analyse one or more audio files and build a human-readable timbre report.

    :param in_files: str or list of str, path(s) of audio file(s) to analyse.
                     A single path is wrapped into a one-element list.
    :return: str, formatted report (Russian labels), one section per file.
    """
    if not isinstance(in_files, list):
        in_files = [in_files]
    output_text = ''

    for file_path in in_files:
        if file_path:
            timbre = timbral_extractor(file_path)
            # Numeric default (0) is required: the original default of '' made
            # round('') raise TypeError whenever the extractor omitted a key.
            output_text += f'----- Комплексные характеристики тембра ----- \n'
            output_text += f'1. Глубина, depth (%): {round(timbre.get("depth", 0), 2)} \n'
            output_text += f'2. Яркость, brightness (%): {round(timbre.get("brightness", 0), 2)} \n'
            output_text += f'3. Теплота, warmth (%): {round(timbre.get("warmth", 0), 2)} \n'
            output_text += f'4. Жесткость, hardness (%): {round(timbre.get("hardness", 0), 2)} \n'
            output_text += f'5. Резкость, sharpness (%): {round(timbre.get("sharpness", 0), 2)} \n'
            output_text += f'6. Шершавость, roughness (%): {round(timbre.get("roughness", 0), 2)} \n'
            output_text += f'7. Гулкость, boominess (%): {round(timbre.get("boominess", 0), 2)} \n'
            # reverb is reported as-is (0/1 flag), no rounding needed
            output_text += f'8. Реверберация, reverb (0-1): {timbre.get("reverb", "")} \n'

    return output_text
|
| 23 |
+
|
| 24 |
+
def timbre_iosr_analysis(in_files):
    """Gradio callback: delegate straight to the main analysis routine."""
    return main_timbre_iosr_analysis(in_files)
|
| 27 |
+
|
| 28 |
+
# Build the Gradio UI: one audio-file input (left), one text report output (right).
iface = gr.Interface(
    fn=timbre_iosr_analysis,
    inputs=[
        gr.Audio(type="filepath", label="Загрузить аудио файл"),
    ],
    outputs=[
        gr.Textbox(label="Результаты"),
    ],
    title="Анализатор \"Характеристики тембра\"",
    # Typos fixed in the user-facing text: «Выбирите» -> «Выберите»,
    # «появление результатов» -> «появления результатов».
    description="Выберите аудиофайл для анализа, дождитесь его загрузки в окне прослушивания (слева). Затем нажмите кнопку \"Запустить\". \n Дождитесь появления результатов в окне вывода (справа).",
    submit_btn="Запустить",
    clear_btn="Очистить",
    allow_flagging="never"
)

if __name__ == "__main__":
    # share=True publishes a temporary public link; debug=True blocks and
    # prints errors to the console.
    iface.launch(debug=True, share=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy
|
| 2 |
+
soundfile
|
| 3 |
+
librosa
|
| 4 |
+
scipy
|
| 5 |
+
scikit-learn
|
| 6 |
+
six
|
| 7 |
+
pyloudnorm
gradio
|
timbral_models/Timbral_Booming.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from . import timbral_util
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def boominess_calculate(loudspec):
    """Compute the Booming Index from a specific-loudness spectrum.

    Implements the index described by Hatano, S., and Hashimoto, T., "Booming
    index as a measure for evaluating booming sensation", The 29th International
    Congress and Exhibition on Noise Control Engineering, 2000.

    :param loudspec: specific loudness per Bark band, 0.1 to 24 Bark in 0.1
                     steps (as produced by the loudness_1991 code).
    :return: the booming index value.
    """
    # Bark band centres matching the loudness_1991 output: 0.1 to 24 Bark in 0.1 steps
    bark = np.arange(0.1, 24.05, 0.1)
    # convert the Bark values to frequency in Hz
    freqs = 600 * np.sinh(bark / 6.0)

    # centre frequencies of the third-octave bands
    third_octave_centres = [25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500,
                            630, 800, 1000, 1250, 1600, 2000, 2500, 3150, 4000, 5000, 6300,
                            8000, 10000, 12500]

    # map each Bark-scale frequency onto a (fractional) third-octave band index,
    # working on a log-frequency axis
    log_centres = np.log10(third_octave_centres)
    band_step = log_centres[1] - log_centres[0]  # step size on the log scale
    band_index = ((np.log10(freqs) - log_centres[0]) / float(band_step)) + 1

    # weighting curve derived from the estimated band indexes
    weighting = 2.13 * np.exp(-0.151 * band_index)
    # override the lowest bands so the curve rolls off (first value is estimated)
    weighting[:4] = [0.8, 1.05, 1.10, 1.18]

    # index of the first band at or above 280 Hz; everything before it is "low frequency"
    first_280_idx = np.where(freqs >= 280)[0][0]

    weighted_spec = loudspec * weighting
    total_loudness = np.sum(loudspec)
    low_loudness = np.sum(loudspec[:first_280_idx])

    # log-domain band sum, scaled by the low-frequency proportion of loudness
    band_sum = timbral_util.log_sum(weighted_spec)
    return band_sum * (low_loudness / total_loudness)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def timbral_booming(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False):
    """
    This is an implementation of the Hashimoto booming index feature.
    There are a few fudge factors with the code to convert between the internal representation of the sound using the
    same loudness calculation as the sharpness code.  The equation for calculating the booming index is not
    specifically quoted anywhere so I've done the best i can with the code that was presented.

    Shin, SH, Ih, JG, Hashimoto, T., and Hatano, S.: "Sound quality evaluation of the booming sensation for passenger
    cars", Applied Acoustics, Vol. 70, 2009.

    Hatano, S., and Hashimoto, T. "Booming index as a measure for
    evaluating booming sensation", The 29th International congress and Exhibition on Noise Control Engineering, 2000.

    This function calculates the apparent Boominess of an audio file.

    This version of timbral_booming contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname:                 string or numpy array
                                  string, audio filename to be analysed, including full file path and extension.
                                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs:                    int/float, when fname is a numpy array, this is a required to be the sample rate.
                                  Defaults to 0.
    :param dev_output:            bool, when False return the boominess, when True return all extracted features.
                                  Defaults to False.
    :param phase_correction:      bool, if the inter-channel phase should be estimated when performing a mono sum.
                                  Defaults to False.
    :param clip_output:           bool, force the output to be between 0 and 100.

    :return: float, apparent boominess of the audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    # window the audio file into 4096 sample sections
    windowed_audio = timbral_util.window_audio(audio_samples, window_length=4096)

    windowed_booming = []  # per-window booming index
    windowed_rms = []      # per-window RMS, used later as weighting
    for i in range(windowed_audio.shape[0]):
        samples = windowed_audio[i, :]  # the current time window
        # get the rms value and append to list
        windowed_rms.append(np.sqrt(np.mean(samples * samples)))

        # calculate the specific loudness
        N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0)

        # calculate the booming index only if the window contains any loudness;
        # silent windows contribute 0 rather than dividing by zero inside
        # boominess_calculate
        if N_entire > 0:
            BoomingIndex = boominess_calculate(N_single)
        else:
            BoomingIndex = 0

        windowed_booming.append(BoomingIndex)

    # get level of low frequencies (0-70 Hz band of the Bark-weighted level)
    ll, w_ll = timbral_util.weighted_bark_level(audio_samples, fs, 0, 70)

    ll = np.log10(ll)
    # convert to numpy arrays for fancy indexing
    windowed_booming = np.array(windowed_booming)
    windowed_rms = np.array(windowed_rms)

    # get the weighted average: louder windows (by RMS energy) dominate
    rms_boom = np.average(windowed_booming, weights=(windowed_rms * windowed_rms))
    rms_boom = np.log10(rms_boom)

    if dev_output:
        # return the raw extracted features instead of the regression output
        return [rms_boom, ll]
    else:

        # perform the linear regression: [rms_boom, ll, 1] . coefficients
        # (the trailing 1 provides the intercept term)
        all_metrics = np.ones(3)
        all_metrics[0] = rms_boom
        all_metrics[1] = ll

        coefficients = np.array([43.67402696195865, -10.90054738389845, 26.836530575185435])

        boominess = np.sum(all_metrics * coefficients)

        if clip_output:
            boominess = timbral_util.output_clip(boominess)

        return boominess
|
timbral_models/Timbral_Brightness.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from . import timbral_util
|
| 5 |
+
from scipy.signal import spectrogram
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def timbral_brightness(fname, fs=0, dev_output=False, clip_output=False, phase_correction=False, threshold=0,
                       ratio_crossover=2000, centroid_crossover=100, stepSize=1024, blockSize=2048, minFreq=20):
    """
    This function calculates the apparent Brightness of an audio file.
    This version of timbral_brightness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname:               string or numpy array
                                string, audio filename to be analysed, including full file path and extension.
                                numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs:                  int/float, when fname is a numpy array, this is a required to be the sample rate.
                                Defaults to 0.
    :param dev_output:          bool, when False return the brightness, when True return all extracted features.
    :param clip_output:         bool, force the output to be between 0 and 100.
    :param phase_correction:    bool, Perform phase checking before summing to mono.
    :param threshold:           Threshold below which to ignore the energy in a time window, default to 0.
    :param ratio_crossover:     Crossover frequency for calculating the HF energy ratio, default to 2000 Hz.
    :param centroid_crossover:  Highpass frequency for calculating the spectral centroid, default to 100 Hz.
    :param stepSize:            Step size for calculating spectrogram, default to 1024.
                                NOTE(review): this parameter is never read in the body (the hop is derived
                                from blockSize); kept for interface compatibility.
    :param blockSize:           Block size (fft length) for calculating spectrogram, default to 2048.
    :param minFreq:             Frequency for high-pass filtering audio prior to all analysis, default to 20 Hz.

    :return:                    Apparent brightness of audio file, float.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Filter audio
    '''
    # highpass audio at minimum frequency; applied three times to cascade the
    # filter into a steeper effective roll-off
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs)

    # get highpass audio at ratio crossover (cascaded three times, as above)
    ratio_highpass_audio = timbral_util.filter_audio_highpass(audio_samples, ratio_crossover, fs)
    ratio_highpass_audio = timbral_util.filter_audio_highpass(ratio_highpass_audio, ratio_crossover, fs)
    ratio_highpass_audio = timbral_util.filter_audio_highpass(ratio_highpass_audio, ratio_crossover, fs)

    # get highpass audio at centroid crossover (cascaded three times, as above)
    centroid_highpass_audio = timbral_util.filter_audio_highpass(audio_samples, centroid_crossover, fs)
    centroid_highpass_audio = timbral_util.filter_audio_highpass(centroid_highpass_audio, centroid_crossover, fs)
    centroid_highpass_audio = timbral_util.filter_audio_highpass(centroid_highpass_audio, centroid_crossover, fs)

    '''
    Get spectrograms
    '''
    # normalise all signals by the maximum of the (filtered) full-band audio,
    # so the three spectrograms share one scale; audio_samples must be
    # normalised last since the other two divide by its maximum
    ratio_highpass_audio *= (1.0 / max(abs(audio_samples)))
    centroid_highpass_audio *= (1.0 / max(abs(audio_samples)))
    audio_samples *= (1.0 / max(abs(audio_samples)))

    # set FFT parameters (75% overlap)
    nfft = blockSize
    hop_size = int(3 * nfft / 4)

    # check that audio is long enough to generate spectrograms
    if len(audio_samples) >= nfft:
        # get spectrogram
        ratio_all_freq, ratio_all_time, ratio_all_spec = spectrogram(audio_samples, fs, 'hamming', nfft,
                                                                     hop_size, nfft, 'constant', True, 'spectrum')
        ratio_hp_freq, ratio_hp_time, ratio_hp_spec = spectrogram(ratio_highpass_audio, fs, 'hamming', nfft,
                                                                  hop_size, nfft, 'constant', True, 'spectrum')
        centroid_hp_freq, centroid_hp_time, centroid_hp_spec = spectrogram(centroid_highpass_audio, fs, 'hamming', nfft,
                                                                           hop_size, nfft, 'constant', True, 'spectrum')
    else:
        # audio shorter than one FFT block: use the whole signal as a single window
        ratio_all_freq, ratio_all_time, ratio_all_spec = spectrogram(audio_samples, fs, 'hamming',
                                                                     len(audio_samples),
                                                                     len(audio_samples)-1,
                                                                     nfft, 'constant', True, 'spectrum')
        ratio_hp_freq, ratio_hp_time, ratio_hp_spec = spectrogram(ratio_highpass_audio, fs, 'hamming',
                                                                  len(ratio_highpass_audio),
                                                                  len(ratio_highpass_audio)-1,
                                                                  nfft, 'constant', True, 'spectrum')
        centroid_hp_freq, centroid_hp_time, centroid_hp_spec = spectrogram(centroid_highpass_audio, fs, 'hamming',
                                                                           len(centroid_highpass_audio),
                                                                           len(centroid_highpass_audio)-1,
                                                                           nfft, 'constant', True, 'spectrum')

    # initialise variables for storing data
    all_ratio = []
    all_hp_centroid = []
    all_tpower = []
    all_hp_centroid_tpower = []

    # set the energy threshold: 0 means "any energy counts", otherwise it is
    # threshold dB below the most energetic frequency bin
    threshold_db = threshold
    if threshold_db == 0:
        threshold = 0
        hp_threshold = 0
    else:
        max_power = max(np.sum(ratio_all_spec, axis=1))
        threshold = max_power * timbral_util.db2mag(threshold_db)

    '''
    Calculate features for each time window
    '''
    for idx in range(len(ratio_hp_time)):
        # get the current spectrum for this time window
        current_ratio_hp_spec = ratio_hp_spec[:, idx]
        current_ratio_all_spec = ratio_all_spec[:, idx]
        current_centroid_hp_spec = centroid_hp_spec[:, idx]

        # get the power within each spectrum
        tpower = np.sum(current_ratio_all_spec)
        hp_tpower = np.sum(current_ratio_hp_spec)
        # check there is energy in the time window before calculating the ratio (greater than threshold)
        if tpower > threshold:
            # get the HF-to-all energy ratio
            all_ratio.append(hp_tpower / tpower)
            # store the power for weighting
            all_tpower.append(tpower)

        # get the tpower to assure greater than zero (avoids division by zero below)
        hp_centroid_tpower = np.sum(current_centroid_hp_spec)
        if hp_centroid_tpower > 0.0:
            # get the spectral centroid of the highpassed spectrum
            all_hp_centroid.append(np.sum(current_centroid_hp_spec * centroid_hp_freq[:len(current_centroid_hp_spec)]) /
                                   np.sum(current_centroid_hp_spec))
            # store the tpower for weighting
            all_hp_centroid_tpower.append(hp_centroid_tpower)

    '''
    Get mean and weighted average values
    '''
    # NOTE(review): mean_ratio and mean_hp_centroid are computed but never used;
    # only the power-weighted averages feed the model below.
    mean_ratio = np.mean(all_ratio)
    mean_hp_centroid = np.mean(all_hp_centroid)

    weighted_mean_ratio = np.average(all_ratio, weights=all_tpower)
    weighted_mean_hp_centroid = np.average(all_hp_centroid, weights=all_hp_centroid_tpower)

    if dev_output:
        # return the ratio and centroid (log domain)
        return np.log10(weighted_mean_ratio), np.log10(weighted_mean_hp_centroid)
    else:
        # perform the linear regression: [log ratio, log centroid, 1] . coefficients
        # (the trailing 1 provides the intercept term)
        all_metrics = np.ones(3)
        all_metrics[0] = np.log10(weighted_mean_ratio)
        all_metrics[1] = np.log10(weighted_mean_hp_centroid)

        coefficients = np.array([4.613128018020465, 17.378889309312974, 17.434733750553022])

        bright = np.sum(all_metrics * coefficients)

        if clip_output:
            bright = timbral_util.output_clip(bright)

        return bright
|
timbral_models/Timbral_Depth.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from scipy.signal import spectrogram
|
| 5 |
+
import scipy.stats
|
| 6 |
+
from . import timbral_util
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def timbral_depth(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, threshold_db=-60,
                  low_frequency_limit=20, centroid_crossover_frequency=2000, ratio_crossover_frequency=500,
                  db_decay_threshold=-40):
    """
    This function calculates the apparent Depth of an audio file.
    This version of timbral_depth contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs:                              int/float, when fname is a numpy array, this is required to be the
                                            sample rate. Defaults to 0.
    :param phase_correction:                bool, perform phase checking before summing to mono. Defaults to False.
    :param dev_output:                      bool, when False return the depth, when True return all extracted
                                            features. Defaults to False.
    :param clip_output:                     bool, force the output to be between 0 and 100.
    :param threshold_db:                    float/int (negative), threshold, in dB, for calculating centroids.
                                            Should be negative. Defaults to -60.
    :param low_frequency_limit:             float/int, low frequency limit at which to highpass filter the audio,
                                            in Hz. Defaults to 20.
    :param centroid_crossover_frequency:    float/int, crossover frequency for calculating the spectral centroid,
                                            in Hz. Defaults to 2000.
    :param ratio_crossover_frequency:       float/int, crossover frequency for calculating the ratio, in Hz.
                                            Defaults to 500.
    :param db_decay_threshold:              float/int (negative), threshold, in dB, for estimating duration.
                                            Should be negative. Defaults to -40.

    :return: float, apparent depth of audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Filter audio
    '''
    # highpass audio - run 3 times to get -18dB per octave - unstable filters produced when using a 6th order
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs)

    # running 3 times to get -18dB per octave rolloff, greater than second order filters are unstable in python
    lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(audio_samples,crossover=centroid_crossover_frequency,fs=fs)
    lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(lowpass_centroid_audio_samples,crossover=centroid_crossover_frequency,fs=fs)
    lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(lowpass_centroid_audio_samples,crossover=centroid_crossover_frequency,fs=fs)

    lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(audio_samples,crossover=ratio_crossover_frequency,fs=fs)
    lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(lowpass_ratio_audio_samples,crossover=ratio_crossover_frequency,fs=fs)
    lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(lowpass_ratio_audio_samples,crossover=ratio_crossover_frequency,fs=fs)

    '''
    Get spectrograms and normalise
    '''
    # normalise audio
    # NOTE: the lowpass versions are scaled by the broadband peak FIRST, so all three
    # signals share the same gain; audio_samples must be normalised last.
    lowpass_ratio_audio_samples *= (1.0 / max(abs(audio_samples)))
    lowpass_centroid_audio_samples *= (1.0 / max(abs(audio_samples)))
    audio_samples *= (1.0 / max(abs(audio_samples)))

    # set FFT parameters
    nfft = 4096
    hop_size = int(3 * nfft / 4)  # 75% of the window, i.e. 25% overlap between frames
    # get spectrogram
    if len(audio_samples) > nfft:
        freq, time, spec = spectrogram(audio_samples, fs, 'hamming', nfft, hop_size,
                                       nfft, 'constant', True, 'spectrum')
        lp_centroid_freq, lp_centroid_time, lp_centroid_spec = spectrogram(lowpass_centroid_audio_samples, fs,
                                                                           'hamming', nfft, hop_size, nfft,
                                                                           'constant', True, 'spectrum')
        lp_ratio_freq, lp_ratio_time, lp_ratio_spec = spectrogram(lowpass_ratio_audio_samples, fs, 'hamming', nfft,
                                                                  hop_size, nfft, 'constant', True, 'spectrum')

    else:
        # file is shorter than 4096, just take the fft
        freq, time, spec = spectrogram(audio_samples, fs, 'hamming', len(audio_samples), len(audio_samples)-1,
                                       nfft, 'constant', True, 'spectrum')
        lp_centroid_freq, lp_centroid_time, lp_centroid_spec = spectrogram(lowpass_centroid_audio_samples, fs,
                                                                           'hamming',
                                                                           len(lowpass_centroid_audio_samples),
                                                                           len(lowpass_centroid_audio_samples)-1,
                                                                           nfft, 'constant', True, 'spectrum')
        lp_ratio_freq, lp_ratio_time, lp_ratio_spec = spectrogram(lowpass_ratio_audio_samples, fs, 'hamming',
                                                                  len(lowpass_ratio_audio_samples),
                                                                  len(lowpass_ratio_audio_samples)-1,
                                                                  nfft, 'constant', True, 'spectrum')

    # convert the dB threshold to a linear magnitude for frame-energy gating below
    threshold = timbral_util.db2mag(threshold_db)

    '''
    METRIC 1 - limited weighted mean normalised lower centroid
    '''
    # define arrays for storing metrics
    all_normalised_lower_centroid = []
    all_normalised_centroid_tpower = []

    # get metrics for each time segment of the spectrogram
    for idx in range(len(time)):
        # get overall spectrum of time frame
        current_spectrum = spec[:, idx]
        # calculate time window power
        tpower = np.sum(current_spectrum)
        all_normalised_centroid_tpower.append(tpower)

        # estimate if time segment contains audio energy or just noise
        if tpower > threshold:
            # get the spectrum
            lower_spectrum = lp_centroid_spec[:, idx]
            lower_power = np.sum(lower_spectrum)

            # get lower centroid
            lower_centroid = np.sum(lower_spectrum * lp_centroid_freq) / float(lower_power)

            # append to list
            all_normalised_lower_centroid.append(lower_centroid)
        else:
            # below-threshold frames contribute a zero centroid but still carry
            # their (small) power as a weight in the average below
            all_normalised_lower_centroid.append(0)

    # calculate the weighted mean of lower centroids
    weighted_mean_normalised_lower_centroid = np.average(all_normalised_lower_centroid,
                                                         weights=all_normalised_centroid_tpower)
    # limit to the centroid crossover frequency
    if weighted_mean_normalised_lower_centroid > centroid_crossover_frequency:
        limited_weighted_mean_normalised_lower_centroid = np.float64(centroid_crossover_frequency)
    else:
        limited_weighted_mean_normalised_lower_centroid = weighted_mean_normalised_lower_centroid

    '''
    METRIC 2 - weighted mean normalised lower ratio
    '''
    # define arrays for storing metrics
    all_normalised_lower_ratio = []
    all_normalised_ratio_tpower = []

    # get metrics for each time segment of the spectrogram
    for idx in range(len(time)):
        # get time frame of broadband spectrum
        current_spectrum = spec[:, idx]
        tpower = np.sum(current_spectrum)
        all_normalised_ratio_tpower.append(tpower)

        # estimate if time segment contains audio energy or just noise
        if tpower > threshold:
            # get the lowpass spectrum
            lower_spectrum = lp_ratio_spec[:, idx]
            # get the power of this
            lower_power = np.sum(lower_spectrum)
            # get the ratio of LF to all energy
            lower_ratio = lower_power / float(tpower)
            # append to array
            all_normalised_lower_ratio.append(lower_ratio)
        else:
            all_normalised_lower_ratio.append(0)

    # calculate the power-weighted mean low-frequency energy ratio
    weighted_mean_normalised_lower_ratio = np.average(all_normalised_lower_ratio, weights=all_normalised_ratio_tpower)

    '''
    METRIC 3 - Approximate duration/decay-time of sample
    '''
    all_my_duration = []

    # get envelope of signal
    envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs)
    # estimate onsets
    onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs)

    # get RMS envelope - better follows decays than the sample-and-hold
    rms_step_size = 256
    rms_envelope = timbral_util.calculate_rms_enveope(audio_samples, step_size=rms_step_size)

    # convert decay threshold to magnitude
    decay_threshold = timbral_util.db2mag(db_decay_threshold)
    # rescale onsets to rms stepsize - casting to int
    time_convert = fs / float(rms_step_size)  # frames-per-second of the RMS envelope
    onsets = (np.array(onsets) / float(rms_step_size)).astype('int')

    for idx, onset in enumerate(onsets):
        if onset == onsets[-1]:
            # last onset: segment runs to the end of the envelope
            segment = rms_envelope[onset:]
        else:
            segment = rms_envelope[onset:onsets[idx + 1]]

        # get location of max RMS frame
        max_idx = np.argmax(segment)
        # get the segment from this max until the next onset
        post_max_segment = segment[max_idx:]

        # estimate duration based on decay or until next onset
        if min(post_max_segment) >= decay_threshold:
            # never decays below threshold: duration is the whole post-peak segment
            my_duration = len(post_max_segment) / time_convert
        else:
            # duration ends at the first frame that drops below the decay threshold
            my_duration = np.where(post_max_segment < decay_threshold)[0][0] / time_convert

        # append to array
        all_my_duration.append(my_duration)

    # calculate the log of mean duration
    mean_my_duration = np.log10(np.mean(all_my_duration))

    '''
    METRIC 4 - f0 estimation with peak picking
    '''
    # get the overall spectrum
    all_spectrum = np.sum(spec, axis=1)
    # normalise this to the 0..1 range
    norm_spec = (all_spectrum - np.min(all_spectrum)) / (np.max(all_spectrum) - np.min(all_spectrum))
    # set limit for peak picking
    cthr = 0.01
    # detect peaks
    peak_idx, peak_value, peak_freq = timbral_util.detect_peaks(norm_spec, cthr=cthr, unprocessed_array=norm_spec,
                                                                freq=freq)
    # lowest detected peak frequency is taken as the pitch estimate (log scale);
    # falls back to 0 when the first peak sits at DC
    pitch_estimate = np.log10(min(peak_freq)) if peak_freq[0] > 0 else 0

    # get outputs
    if dev_output:
        # raw features, in the same order the regression model consumes them
        return limited_weighted_mean_normalised_lower_centroid, weighted_mean_normalised_lower_ratio, mean_my_duration, \
               pitch_estimate, weighted_mean_normalised_lower_ratio * mean_my_duration, \
               timbral_util.sigmoid(weighted_mean_normalised_lower_ratio) * mean_my_duration
    else:
        '''
        Perform linear regression to obtain depth
        '''
        # coefficients from linear regression (fitted offline against listener depth
        # ratings; the final value is the intercept)
        coefficients = np.array([-0.0043703565847874465, 32.83743202462131, 4.750862716905235, -14.217438690256062,
                                 3.8782339862813924, -0.8544826091735516, 66.69534393444391])

        # the selected predictor features
        metric1 = limited_weighted_mean_normalised_lower_centroid
        metric2 = weighted_mean_normalised_lower_ratio
        metric3 = mean_my_duration
        metric4 = pitch_estimate
        metric5 = metric2 * metric3
        metric6 = timbral_util.sigmoid(metric2) * metric3

        # pack metrics into a matrix
        all_metrics = np.zeros(7)

        all_metrics[0] = metric1
        all_metrics[1] = metric2
        all_metrics[2] = metric3
        all_metrics[3] = metric4
        all_metrics[4] = metric5
        all_metrics[5] = metric6
        all_metrics[6] = 1.0  # bias term for the intercept coefficient

        # perform linear regression
        depth = np.sum(all_metrics * coefficients)

        if clip_output:
            depth = timbral_util.output_clip(depth)

        return depth
|
| 289 |
+
|
timbral_models/Timbral_Extractor.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
import numpy as np
|
| 4 |
+
import six
|
| 5 |
+
from . import timbral_util, timbral_hardness, timbral_depth, timbral_brightness, timbral_roughness, timbral_warmth, \
|
| 6 |
+
timbral_sharpness, timbral_booming, timbral_reverb
|
| 7 |
+
|
| 8 |
+
def timbral_extractor(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, output_type='dictionary', verbose=True):
    """
    The Timbral Extractor will extract all timbral attributes in one function call, returning the results as
    either a list or dictionary, depending on input definitions.

    Version 0.4

    Simply calls each attribute model in turn on a shared, pre-processed copy of the audio.

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs:                  int/float, when fname is a numpy array, this is required to be the sample rate.
                                Defaults to 0.
    :param dev_output:          bool, when False return the attribute scores, when True return all extracted
                                features. Defaults to False.
    :param phase_correction:    bool, perform phase checking before summing to mono. Defaults to False.
    :param clip_output:         bool, force each attribute output to be between 0 and 100.
    :param output_type:         string, defines the type the output should be formatted in. Accepts either
                                'dictionary' or 'list' as parameters. Defaults to 'dictionary'.
    :param verbose:             bool, print a progress message before each attribute is calculated.
                                Defaults to True.

    :return: timbre             the results from all timbral attributes as either a dictionary or list,
                                depending on output_type.

    :raises ValueError:         if output_type is invalid, fs is 0 for an array input, or fname is neither a
                                string nor an array.
    :raises TypeError:          if the audio file cannot be read.

    Copyright 2019 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.
    """
    '''
    Check output_type before calculating anything
    '''
    if output_type != 'dictionary' and output_type != 'list':
        raise ValueError('output_type must be \'dictionary\' or \'list\'.')

    '''
    Basic audio reading
    '''
    if isinstance(fname, six.string_types):
        # read audio file only once and pass arrays to algorithms
        try:
            audio_samples, fs = sf.read(fname)
            # making an array again for copying purposes
            multi_channel_audio = np.array(audio_samples)
        except Exception as err:
            # was a bare 'except:', which also swallowed KeyboardInterrupt/SystemExit;
            # narrowed to Exception and chained so the original cause stays visible.
            print('Soundfile failed to load: ' + str(fname))
            raise TypeError('Unable to read audio file.') from err
    elif hasattr(fname, 'shape'):
        if fs == 0:
            raise ValueError('If giving function an array, \'fs\' must be specified')
        audio_samples = fname
        multi_channel_audio = np.array(fname)
    else:
        raise ValueError('Input must be either a string or a numpy array.')

    # channel reduction (sum to mono for the single-channel attribute models)
    audio_samples = timbral_util.channel_reduction(audio_samples)

    # resample audio file if sample rate is less than 44100
    audio_samples, fs = timbral_util.check_upsampling(audio_samples, fs)

    # each attribute model accepts the pre-read sample array directly
    if verbose:
        print('Calculating hardness...')
    hardness = timbral_hardness(audio_samples, fs=fs,
                                dev_output=dev_output,
                                phase_correction=phase_correction,
                                clip_output=clip_output)
    if verbose:
        print('Calculating depth...')
    depth = timbral_depth(audio_samples, fs=fs,
                          dev_output=dev_output,
                          phase_correction=phase_correction,
                          clip_output=clip_output)
    if verbose:
        print('Calculating brightness...')
    brightness = timbral_brightness(audio_samples, fs=fs,
                                    dev_output=dev_output,
                                    phase_correction=phase_correction,
                                    clip_output=clip_output)
    if verbose:
        print('Calculating roughness...')
    roughness = timbral_roughness(audio_samples, fs=fs,
                                  dev_output=dev_output,
                                  phase_correction=phase_correction,
                                  clip_output=clip_output)
    if verbose:
        print('Calculating warmth...')
    warmth = timbral_warmth(audio_samples, fs=fs,
                            dev_output=dev_output,
                            phase_correction=phase_correction,
                            clip_output=clip_output)
    if verbose:
        print('Calculating sharpness...')
    sharpness = timbral_sharpness(audio_samples, fs=fs,
                                  dev_output=dev_output,
                                  phase_correction=phase_correction,
                                  clip_output=clip_output)
    if verbose:
        print('Calculating boominess...')
    boominess = timbral_booming(audio_samples, fs=fs,
                                dev_output=dev_output,
                                phase_correction=phase_correction,
                                clip_output=clip_output)
    if verbose:
        print('Calculating reverb...')
    # reverb calculated on all channels
    reverb = timbral_reverb(multi_channel_audio, fs=fs)

    '''
    Format output
    '''
    if output_type == 'dictionary':
        timbre = {
            'hardness': hardness,
            'depth': depth,
            'brightness': brightness,
            'roughness': roughness,
            'warmth': warmth,
            'sharpness': sharpness,
            'boominess': boominess,
            'reverb': reverb
        }
    elif output_type == 'list':
        timbre = [hardness, depth, brightness, roughness, warmth, sharpness, boominess, reverb]
    else:
        # defensive: unreachable given the validation at the top of the function
        raise ValueError('output_type must be \'dictionary\' or \'list\'.')

    return timbre
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
timbral_models/Timbral_Hardness.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import librosa
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
import six
|
| 6 |
+
from scipy.signal import spectrogram
|
| 7 |
+
from . import timbral_util
|
| 8 |
+
|
| 9 |
+
def timbral_hardness(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, max_attack_time=0.1,
                     bandwidth_thresh_db=-75):
    """
    This function calculates the apparent hardness of an audio file.
    This version of timbral_hardness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs:                      int/float, when fname is a numpy array, this is required to be the sample
                                    rate. Defaults to 0.
    :param phase_correction:        bool, perform phase checking before summing to mono. Defaults to False.
    :param dev_output:              bool, when False return the hardness, when True return all extracted
                                    features. Defaults to False.
    :param clip_output:             bool, force the output to be between 0 and 100.
    :param max_attack_time:         float, set the maximum attack time, in seconds. Defaults to 0.1.
    :param bandwidth_thresh_db:     float, set the threshold for calculating the bandwidth. Defaults to -75dB.


    :return: float, apparent hardness of audio file (dev_output = False/default).
             With dev_output set to True returns the weighted mean bandwidth,
             mean attack time, harmonic-percussive ratio, and unitless attack centroid.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """

    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Calculate the midband level
    '''
    # get the level in the midband (Bark bands 70-140 on the util's band scale)
    midband_level, weighed_midband_level = timbral_util.weighted_bark_level(audio_samples, fs, low_bark_band=70,
                                                                            upper_bark_band=140)
    log_weighted_midband_level = np.log10(weighed_midband_level)

    '''
    Calculate the harmonic-percussive ratio pre zero-padding the signal
    '''
    HP_ratio = timbral_util.get_percussive_audio(audio_samples, return_ratio=True)
    log_HP_ratio = np.log10(HP_ratio)

    '''
    Zeropad the signal
    '''
    # zero pad the signal so the first analysis window is fully populated
    nperseg = 4096  # default value for spectrogram analysis
    audio_samples = np.lib.pad(audio_samples, (nperseg+1, 0), 'constant', constant_values=(0.0, 0.0))

    '''
    Calculate the envelope and onsets
    '''
    # calculate the envelope of the signal
    envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.1)
    envelope_time = np.arange(len(envelope)) / fs

    # calculate the onsets
    original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg)
    onset_strength = librosa.onset.onset_strength(y=audio_samples, sr=fs)
    # If onsets don't exist, set it to time zero
    if not original_onsets:
        original_onsets = [0]
    # set to start of file in the case where there is only one onset
    if len(original_onsets) == 1:
        original_onsets = [0]

    # undo the zero-padding offset so onsets index into the bandwidth array; clamp at 0
    onsets = np.array(original_onsets) - nperseg
    onsets[onsets < 0] = 0

    '''
    Calculate the spectrogram so that the bandwidth can be created
    '''
    bandwidth_step_size = 128
    mag = timbral_util.db2mag(bandwidth_thresh_db)  # calculate threshold in linear from dB
    bandwidth, t, f = timbral_util.get_bandwidth_array(audio_samples, fs, nperseg=nperseg,
                                                       overlap_step=bandwidth_step_size, rolloff_thresh=mag,
                                                       normalisation_method='none')
    # bandwidth sample rate
    bandwidth_fs = fs / float(bandwidth_step_size)  # fs due to spectrogram step size

    '''
    Set all parameters for holding data per onset
    '''
    all_bandwidth_max = []
    all_attack_time = []
    all_max_strength = []
    all_max_strength_bandwidth = []
    all_attack_centroid = []

    '''
    Get bandwidth onset times and max bandwidth
    '''
    bandwidth_onset = np.array(onsets / float(bandwidth_step_size)).astype('int')  # overlap_step=128

    '''
    Iterate through onsets and calculate metrics for each
    '''
    for onset_count in range(len(bandwidth_onset)):
        '''
        Calculate the bandwidth max for the attack portion of the onset
        '''
        # get the section of the bandwidth array between onsets
        onset = bandwidth_onset[onset_count]
        if onset == bandwidth_onset[-1]:
            bandwidth_seg = np.array(bandwidth[onset:])
        else:
            next_onset = bandwidth_onset[onset_count + 1]
            bandwidth_seg = np.array(bandwidth[onset:next_onset])

        if max(bandwidth_seg) > 0:
            # making a copy of the bandwidth segment to avoid array changes
            hold_bandwidth_seg = list(bandwidth_seg)

            # calculate onset of the attack in the bandwidth array
            # NOTE(review): this inner check repeats the enclosing condition, so the
            # else branch here is unreachable; kept byte-identical.
            if max(bandwidth_seg) > 0:
                bandwidth_attack = timbral_util.calculate_attack_time(bandwidth_seg, bandwidth_fs,
                                                                      calculation_type='fixed_threshold',
                                                                      max_attack_time=max_attack_time)
            else:
                bandwidth_attack = []

            # calculate the bandwidth max for the attack portion
            if bandwidth_attack:
                start_idx = bandwidth_attack[2]
                if max_attack_time > 0:
                    max_attack_time_samples = int(max_attack_time * bandwidth_fs)
                    if len(hold_bandwidth_seg[start_idx:]) > start_idx+max_attack_time_samples:
                        all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:start_idx+max_attack_time_samples]))
                    else:
                        all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:]))
                else:
                    all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:]))
        else:
            # set as blank so bandwidth is skipped for this onset
            bandwidth_attack = []

        '''
        Calculate the attack time
        '''
        onset = original_onsets[onset_count]
        if onset == original_onsets[-1]:
            attack_seg = np.array(envelope[onset:])
            strength_seg = np.array(onset_strength[int(onset/512):])  # 512 is librosa default window size
            audio_seg = np.array(audio_samples[onset:])
        else:
            attack_seg = np.array(envelope[onset:original_onsets[onset_count + 1]])
            strength_seg = np.array(onset_strength[int(onset/512):int(original_onsets[onset_count+1]/512)])
            audio_seg = np.array(audio_samples[onset:original_onsets[onset_count + 1]])

        attack_time = timbral_util.calculate_attack_time(attack_seg, fs, max_attack_time=max_attack_time)
        all_attack_time.append(attack_time[0])

        '''
        Get the attack strength for weighting the bandwidth max
        '''
        all_max_strength.append(max(strength_seg))
        if bandwidth_attack:
            # only onsets with a valid bandwidth attack contribute a weight
            all_max_strength_bandwidth.append(max(strength_seg))

        '''
        Get the spectral centroid of the attack (125ms after attack start)
        '''
        # identify the start of the attack
        th_start_idx = attack_time[2]
        # define how long the attack time can be
        centroid_int_samples = int(0.125 * fs)  # number of samples for attack time integration

        # start of attack section from attack time calculation
        if th_start_idx + centroid_int_samples >= len(audio_seg):
            audio_seg = audio_seg[th_start_idx:]
        else:
            audio_seg = audio_seg[th_start_idx:th_start_idx + centroid_int_samples]

        # check that there's a suitable length of samples to get attack centroid
        # minimum length arbitrarily set to 512 samples
        if len(audio_seg) > 512:
            # get all spectral features for this attack section
            spectral_features_hold = timbral_util.get_spectral_features(audio_seg, fs)

            # store unitless attack centroid if exists
            if spectral_features_hold:
                all_attack_centroid.append(spectral_features_hold[0])

    '''
    Calculate mean and weighted average values for features
    '''
    # attack time
    mean_attack_time = np.mean(all_attack_time)

    # get the weighted mean of bandwidth max and limit lower value
    if len(all_bandwidth_max):
        mean_weighted_bandwidth_max = np.average(all_bandwidth_max, weights=all_max_strength_bandwidth)
        # check for zero values so the log bandwidth max can be taken
        if mean_weighted_bandwidth_max <= 512.0:
            mean_weighted_bandwidth_max = fs / 512.0  # minimum value
    else:
        mean_weighted_bandwidth_max = fs / 512.0  # minimum value

    # take the logarithm
    log_weighted_bandwidth_max = np.log10(mean_weighted_bandwidth_max)

    # get the mean of the onset strengths
    mean_max_strength = np.mean(all_max_strength)
    log_mean_max_strength = np.log10(mean_max_strength)

    if all_attack_centroid:
        mean_attack_centroid = np.mean(all_attack_centroid)
    else:
        mean_attack_centroid = 200.0

    # limit the lower limit of the attack centroid to allow for log to be taken
    if mean_attack_centroid <= 200:
        mean_attack_centroid = 200.0
    log_attack_centroid = np.log10(mean_attack_centroid)

    '''
    Either return the raw features, or calculate the linear regression.
    '''
    if dev_output:
        return log_weighted_bandwidth_max, log_attack_centroid, log_weighted_midband_level, log_HP_ratio, log_mean_max_strength, mean_attack_time
    else:
        '''
        Apply regression model
        '''
        # np.ones leaves index 6 as the bias term (1.0) for the intercept coefficient
        all_metrics = np.ones(7)
        all_metrics[0] = log_weighted_bandwidth_max
        all_metrics[1] = log_attack_centroid
        all_metrics[2] = log_weighted_midband_level
        all_metrics[3] = log_HP_ratio
        all_metrics[4] = log_mean_max_strength
        all_metrics[5] = mean_attack_time

        # coefficients = np.array([13.5330599736, 18.1519030059, 13.1679266873, 5.03134507433, 5.22582123237, -3.71046018962, -89.8935449357])

        # recalculated values when using loudnorm
        coefficients = np.array([12.079781720638145, 18.52100377170042, 14.139883645260355, 5.567690321917516,
                                 3.9346817690405635, -4.326890461087848, -85.60352209068202])

        hardness = np.sum(all_metrics * coefficients)

        # clip output between 0 and 100
        if clip_output:
            hardness = timbral_util.output_clip(hardness)

        return hardness
|
timbral_models/Timbral_Reverb.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
import six
|
| 5 |
+
from scipy.signal import spectrogram
|
| 6 |
+
from . import timbral_util
|
| 7 |
+
|
| 8 |
+
def timbral_reverb(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False):
    """
    Classify whether an audio file sounds reverberant.

    Based on the blind RT60 estimation algorithm documented in:
    Jan, T., and Wang, W., 2012: "Blind reverberation time estimation based on
    Laplace distribution", EUSIPCO, pp. 2050-2054, Bucharest, Romania.

    Version 0.4

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs: int/float, required sample rate when fname is a numpy array.  Defaults to 0.
    :param phase_correction: unused; implemented for consistency with the other timbral functions.
    :param dev_output: unused for the classification itself; when True the raw
                       (mean_RT60, probability) pair is returned instead of the class.
    :param clip_output: unused; implemented for consistency with the other timbral functions.

    :return: predicted reverb class of the audio file: 1 means the file sounds
             reverberant, 0 means it does not.

    Copyright 2019 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.
    """
    # read the audio without a mono sum so each channel can be analysed separately
    audio, fs = timbral_util.file_read(fname, fs=fs, phase_correction=False, mono_sum=False, loudnorm=False)

    if len(audio.shape) < 2:
        # mono file: estimate the RT60 directly
        mean_RT60 = estimate_RT60(audio, fs)
    else:
        # multichannel file: estimate the RT60 of the first two channels and average
        channel_estimates = [estimate_RT60(audio[:, ch], fs) for ch in (0, 1)]
        mean_RT60 = np.mean(channel_estimates)

    # logistic regression maps the RT60 estimate onto a reverberant/not probability
    probability = reverb_logistic_regression(mean_RT60)

    if dev_output:
        return mean_RT60, probability

    # threshold the probability at 0.5 to obtain the binary class
    return 1 if probability >= 0.5 else 0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def estimate_RT60(audio_samples, fs):
    """
    Blind RT60 estimation for a single channel of audio.

    The audio is processed frame-wise in the time domain: each frame is passed
    to rt_estimate_frame_my(), which returns a recursively smoothed RT
    estimate plus a raw (pre-smoothing) ML estimate.  Frames whose raw
    estimate is negative detected no sound decay and are discarded from the
    final mean.

    Adapted from the original MATLAB code by Heinrich Loellmann (IND, RWTH
    Aachen); Python port of the Tariqullah Jan / Wenwu Wang (University of
    Surrey, 2012) implementation.

    :param audio_samples: 1-D numpy array of audio samples (single channel).
    :param fs: int/float, sample rate of audio_samples in Hz.
    :return: float, mean estimated RT60 in seconds.  When the audio is too
             short for analysis, or no decay phase was detected in any frame,
             the smallest considered RT (par['Tquant'][0]) is returned.
    """
    # parameters and buffers for frame-wise processing
    par = init_rt_estimate_e(fs)
    BL = par['N'] * par['down']  # frame length before downsampling

    Laudio = len(audio_samples)

    # check the audio file is long enough for analysis
    if BL >= Laudio:
        # audio too short: return the smallest considered RT value
        return par['Tquant'][0]

    rt_est = []    # smoothed RT estimate per frame
    RT_final = []  # raw ML estimate per frame (-1 => no decay detected)

    # frame-wise processing in the time domain
    for n in np.arange(0, Laudio - BL + 1, par['N_shift']):
        ind = np.arange(n, n + BL)  # indices of the current frame

        # downsample the frame by par['down'] and run the actual RT estimation
        RT, par, finalrt = rt_estimate_frame_my(audio_samples[ind[np.arange(0, len(ind), par['down'])]], par)

        rt_est.append(RT)
        RT_final.append(finalrt)

    # clamp negative (no-decay) markers to zero so they can be filtered below
    RT_final = np.clip(RT_final, 0, max(RT_final))

    # keep only the frames where a decay phase produced a positive raw estimate.
    # (The original code also built an `aaa`/`RT_temp_new`/`RTfinal_value` set of
    # intermediate values here, twice, but none of them influenced the returned
    # result; that dead code has been removed.)
    rt_est = np.array(rt_est)
    rt_est = rt_est[np.where(RT_final > 0)]

    if rt_est.size:
        return np.mean(rt_est)
    else:
        # no decay detected anywhere: fall back to the smallest considered RT
        return par['Tquant'][0]
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def init_rt_estimate_e(fs=24000):
    """
    Build the parameter/buffer dictionary used by rt_estimate_frame_my() for
    blind, frame-wise estimation of the reverberation time (RT) in the time
    domain.

    :param fs: int/float, sampling frequency in Hz (default 24 kHz).
    :return: dict containing all algorithm parameters and processing buffers.

    Original MATLAB implementation by Heiner Loellmann, IND, RWTH Aachen
    University (August 2011).
    """
    # correction factor relative to the reference 24 kHz sample rate
    scale = fs / 24000.0

    # downsample by 2 above 8 kHz to reduce computational complexity
    decimation = 2 if fs > 8e3 else 1

    par = {'fs': fs, 'down': decimation}

    # parameters for pre-selection of suitable segments
    par['N_sub'] = int(round(scale * 700 / decimation))    # sub-frame length (after downsampling)
    par['N_shift'] = int(round(scale * 200 / decimation))  # frame shift (before downsampling)
    par['nos_min'] = 3                                     # minimal number of sub-frames for a decay
    par['nos_max'] = 7                                     # maximal number of sub-frames for a decay
    par['N'] = int(par['nos_max'] * par['N_sub'])          # maximal frame length (after downsampling)

    # parameters for ML estimation: grid of quantised RTs between Tmin and Tmax
    Tmin, Tmax = 0.2, 1.1
    par['bin'] = 0.1                                       # step size of the RT grid
    par['Tquant'] = np.arange(Tmin, Tmax + par['bin'] / 2, par['bin'])
    # decay-rate factor corresponding to each quantised RT
    par['a'] = np.exp(-3.0 * np.log(10) / (par['Tquant'] * (fs / decimation)))
    par['La'] = len(par['a'])                              # number of considered decay rates (= number of RTs)

    # histogram machinery (order statistics) used to suppress outliers
    par['buffer_size'] = int(round(scale * 800 / decimation))
    par['buffer'] = np.zeros(par['buffer_size'])           # previous bin indices, for histogram ageing
    par['no_bins'] = int(par['La'])
    par['hist_limits'] = np.arange(Tmin - par['bin'] / 2.0, Tmax + par['bin'], par['bin'])
    par['hist_rt'] = np.zeros(par['no_bins'])              # histogram of ML estimates
    par['hist_counter'] = 0                                # incremented whenever the histogram is updated

    # recursive smoothing of the final RT estimate
    par['alpha'] = 0.995                                   # smoothing factor
    par['RT_initial'] = 0.3                                # initial RT estimate
    par['RT_last'] = par['RT_initial']                     # last smoothed estimate
    par['RT_raw'] = par['RT_initial']                      # raw histogram-based estimate

    return par
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def rt_estimate_frame_my(frame, par):
    '''
    Blind, frame-wise estimation of the reverberation time (RT) based on a
    Laplacian decay model.

    INPUT
      frame:  (time-domain) segment of reverberant audio; must be 1-D after
              squeezing (a ValueError is raised otherwise)
      par:    dict with all parameters and buffers created by
              init_rt_estimate_e(); its histogram/smoothing state is mutated
              in place and the same dict is also returned

    OUTPUT
      RT:     recursively smoothed RT estimate
      par:    dict with updated buffers, to enable frame-wise processing
      RT_pre: raw ML estimate for this frame (-1 when no decay was detected;
              returned for debugging and analysis of the algorithm)

    References:
      Loellmann, H.W., Jeub, M., Yilmaz, E., and Vary, P.: "An Improved
      Algorithm for Blind Reverberation Time Estimation", International
      Workshop on Acoustic Echo and Noise Control (IWAENC), Tel Aviv,
      Israel, Aug. 2010.

      Tariqullah Jan and Wenwu Wang: "Blind reverberation time estimation
      based on Laplacian distribution", European Signal Processing
      Conference (EUSIPCO), 2012.

    The code was adapted from the original code by Heinrich Loellmann, IND,
    RWTH Aachen.  Authors: Tariqullah Jan, moderated by Wenwu Wang,
    University of Surrey (2012).
    '''
    # the algorithm operates on a single channel only
    if len(np.shape(np.squeeze(frame))) > 1:
        raise ValueError('Something went wrong...')

    cnt = 0    # sub-frame counter for pre-selection of a possible sound decay
    RTml = -1  # default RT estimate (-1 indicates no new ML estimate this frame)

    # variance, minimum and maximum of the first sub-frame
    seg = frame[:par['N_sub']]

    var_pre = np.var(seg)
    min_pre = np.min(seg)
    max_pre = np.max(seg)

    for k in range(2, par['nos_max']):
        # variance, minimum and maximum of the succeeding sub-frame
        seg = frame[(k-1) * par['N_sub'] : k * par['N_sub']+1]
        var_cur = np.var(seg)
        max_cur = max(seg)
        min_cur = min(seg)

        # -- pre-selection of suitable sound decays --------------------
        if (var_pre > var_cur) and (max_pre > max_cur) and (min_pre < min_cur):
            # variance and maximum decrease while the minimum increases
            # => possible sound decay detected; extend the decay region
            cnt += 1

            # current values become the previous values
            var_pre = var_cur
            max_pre = max_cur
            min_pre = min_cur

        else:
            if cnt >= par['nos_min']:
                # minimum length for an assumed sound decay achieved:
                # maximum-likelihood (ML) estimation of the RT over the decay
                RTml, _ = max_loglf(frame[:cnt*par['N_sub']], par['a'], par['Tquant'])

                break

        # NOTE(review): range(2, par['nos_max']) stops at nos_max - 1, so this
        # condition can never be true in Python (the original MATLAB loop
        # `2:nos_max` is inclusive of nos_max).  Kept byte-identical here;
        # confirm the intended behaviour against the MATLAB reference.
        if k == par['nos_max']:
            # maximum frame length achieved
            RTml, _ = max_loglf(frame[0:cnt * par['N_sub']], par['a'], par['Tquant'])

    # end of sub-frame loop

    if RTml >= 0:  # a new ML estimate was calculated this frame

        # apply order statistics (histogram of recent estimates) to reduce outliers
        par['hist_counter'] += 1

        for i in range(par['no_bins']):

            # find the histogram bin containing the ML estimate
            # (RTml comes from par['Tquant'], whose values lie within
            # par['hist_limits'], so a bin is always found and `index` is bound)
            if (RTml >= par['hist_limits'][i]) and (RTml <= par['hist_limits'][i+1]):

                index = i
                break

        # update the histogram of ML estimates for the RT
        par['hist_rt'][index] += 1

        if par['hist_counter'] > par['buffer_size'] + 1:
            # age out the oldest value from the histogram
            par['hist_rt'][int(par['buffer'][0])] = par['hist_rt'][int(par['buffer'][0])] - 1

        par['buffer'] = np.append(par['buffer'][1:], index)  # update buffer with bin indices
        idx = np.argmax(par['hist_rt'])  # index of the histogram maximum

        par['RT_raw'] = par['Tquant'][idx]  # map the index back to an RT value

    # final RT estimate obtained by recursive (exponential) smoothing
    RT = par['alpha'] * par['RT_last'] + (1 - par['alpha']) * par['RT_raw']
    par['RT_last'] = RT

    RT_pre = RTml  # intermediate ML estimate, returned for later analysis

    return RT, par, RT_pre
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def max_loglf(h, a, Tquant):
    '''
    Maximise the Laplacian log-likelihood (LL) function over a finite set of
    decay rates, and return the LL function itself.

    INPUT
      h:      input frame (1-D; callers guarantee this via
              rt_estimate_frame_my's shape check)
      a:      finite set of decay-rate factors over which the maximum is found
      Tquant: RT value corresponding to each entry of `a`

    OUTPUT
      ML: ML estimate for the RT (the Tquant entry whose decay rate maximises
          the log-likelihood)
      ll: the underlying log-likelihood function, one value per entry of `a`
    '''
    N = len(h)
    n = np.arange(0, N)  # sample indices within the frame
    ll = np.zeros(len(a))

    # magnitude of the frame; the original also computed an unused `sum2`
    # (sum of |h|) inside the loop — removed as dead code, and the
    # loop-invariant abs() is hoisted out of the loop.
    abs_h = np.abs(h.transpose())

    for i in range(len(a)):
        # weighted magnitude sum under the candidate decay rate a[i]
        sum1 = np.dot((a[i] ** (-1.0 * n)), abs_h)
        # ML estimate of the Laplacian scale parameter for this decay rate
        sigma = (1 / N) * sum1
        ll[i] = -N * np.log(2) - N * np.log(sigma) - np.sum(np.log(a[i] ** n)) - (1 / sigma) * sum1

    idx = np.argmax(ll)  # maximum of the log-likelihood function
    ML = Tquant[idx]     # corresponding ML estimate for the RT

    return ML, ll
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def reverb_logistic_regression(mean_RT60):
    """
    Map a blind RT60 estimate onto the probability that the file sounds
    reverberant, using a pre-trained single-feature logistic regression.

    :param mean_RT60: float, mean estimated RT60 in seconds.
    :return: float in (0, 1), probability that the file sounds reverberant.
    """
    # pre-trained model: logit = slope * RT60 + intercept
    slope = 2.97126461
    intercept = -1.45082989
    logit = slope * mean_RT60 + intercept

    # inverse of the logit function (sigmoid) turns the score into a probability
    return 1.0 / (1.0 + np.exp(-logit))
|
timbral_models/Timbral_Roughness.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from . import timbral_util
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def plomp(f1, f2):
    """
    Plomp-style dissonance weighting for pairs of spectral partials, used by
    the Vassilakis roughness model.

    :param f1: array, first frequency of each pair (Hz)
    :param f2: array, second frequency of each pair (Hz)
    :return: array of pairwise dissonance weights; only the lower triangle is
             non-zero, so each unordered pair contributes once.
    """
    # model constants
    b1, b2 = 3.51, 5.75
    xstar = 0.24
    s1, s2 = 0.0207, 18.96

    lower_freq = np.minimum(f1, f2)
    freq_diff = np.abs(f2 - f1)

    # critical-bandwidth scaling; tril keeps each unordered pair only once
    s = np.tril(xstar / (s1 * lower_freq + s2))

    # difference of two decaying exponentials gives the dissonance curve
    return np.exp(-b1 * s * freq_diff) - np.exp(-b2 * s * freq_diff)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def timbral_roughness(fname, dev_output=False, phase_correction=False, clip_output=False, fs=0, peak_picking_threshold=0.01):
    """
    This function is an implementation of the Vassilakis [2007] model of roughness.
    The peak picking algorithm implemented is based on the MIR toolbox's implementation.

    This version of timbral_roughness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Vassilakis, P. 'SRA: A web-based research tool for spectral and roughness analysis of sound signals', Proceedings
    of the 4th Sound and Music Computing Conference, Lefkada, Greece, July, 2007.

    Required parameter
    :param fname:                   string or numpy array
                                    string, audio filename to be analysed, including full file path and extension.
                                    numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param dev_output:              bool, when False return the roughness, when True return all extracted features
                                    (currently just the mean roughness).
    :param phase_correction:        bool, if the inter-channel phase should be estimated when performing a mono sum.
                                    Defaults to False.
    :param clip_output:             bool, force the output to be between 0 and 100.  Defaults to False.
    :param fs:                      int/float, required sample rate when fname is a numpy array.  Defaults to 0.
    :param peak_picking_threshold:  float, threshold on the normalised spectrum for spectral peak picking.

    :return: Roughness of the audio signal.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Pad audio
    '''
    # prepend 512 zero samples so the first onset is not lost at the frame boundary
    audio_samples = np.lib.pad(audio_samples, (512, 0), 'constant', constant_values=(0.0, 0.0))

    '''
    Reshape audio into time windows of 50ms.
    '''
    # reshape audio: 50 ms Hamming windows with 50% overlap, zero-padded to the
    # next power of two for the FFT
    audio_len = len(audio_samples)
    time_step = 0.05
    step_samples = int(fs * time_step)
    nfft = step_samples
    # periodic-style Hamming window: generate nfft+2 points and drop the endpoints
    window = np.hamming(nfft + 2)
    window = window[1:-1]
    olap = nfft / 2
    # NOTE(review): this allocates one more column than the while-loop below
    # fills, so the final column remains all zeros and contributes a zero
    # roughness frame to the mean — confirm this is intended.
    num_frames = int((audio_len)/(step_samples-olap))
    next_pow_2 = np.log(step_samples) / np.log(2)
    next_pow_2 = 2 ** int(next_pow_2 + 1)

    reshaped_audio = np.zeros([next_pow_2, num_frames])

    i = 0
    start_idx = int((i * (nfft / 2.0)))

    # check if audio is too short to be reshaped
    if audio_len > step_samples:
        # window every full 50 ms frame of the audio
        while start_idx+step_samples <= audio_len:
            audio_frame = audio_samples[start_idx:start_idx+step_samples]

            # apply window
            audio_frame = audio_frame * window

            # store the frame; remaining rows stay zero (zero-padding to next_pow_2)
            reshaped_audio[:step_samples, i] = audio_frame

            # advance by half a window (50% overlap)
            i += 1
            start_idx = int((i * (nfft / 2.0)))
    else:
        # audio shorter than one frame: use the padded samples as a single frame
        reshaped_audio[:audio_len, i] = audio_samples

    # magnitude spectrum of each frame (keep only the non-negative frequencies)
    spec = np.fft.fft(reshaped_audio, axis=0)
    spec_len = int(next_pow_2/2) + 1
    spec = spec[:spec_len, :]
    spec = np.absolute(spec)

    # frequency of each spectral bin in Hz
    freq = fs/2 * np.linspace(0, 1, spec_len)

    # normalise the whole spectrogram to [0, 1] based on the global peak TF bin
    norm_spec = (spec - np.min(spec)) / (np.max(spec) - np.min(spec))

    ''' Peak picking algorithm '''
    cthr = peak_picking_threshold  # threshold for peak picking

    _, no_segments = np.shape(spec)

    allpeakpos = []    # bin index of each picked peak, per frame
    allpeaklevel = []  # (unnormalised) level of each picked peak, per frame
    allpeaktime = []   # frequency in Hz of each picked peak, per frame

    for i in range(0, no_segments):
        d = norm_spec[:, i]   # normalised spectrum used for thresholding
        d_un = spec[:, i]     # unnormalised spectrum used for peak levels

        # find peak candidates
        peak_pos, peak_level, peak_x = timbral_util.detect_peaks(d, cthr=cthr, unprocessed_array=d_un, freq=freq)

        allpeakpos.append(peak_pos)
        allpeaklevel.append(peak_level)
        allpeaktime.append(peak_x)

    ''' Calculate the Vasillakis Roughness '''
    # roughness of each frame is the sum of pairwise contributions of its peaks
    allroughness = []
    # for each frame
    for frame in range(len(allpeaklevel)):
        frame_freq = allpeaktime[frame]
        frame_level = allpeaklevel[frame]

        if len(frame_freq) > 1:
            # build all frequency/level pairs as matrices (f1 vs f2, v1 vs v2)
            f2 = np.kron(np.ones([len(frame_freq), 1]), frame_freq)
            f1 = f2.T
            v2 = np.kron(np.ones([len(frame_level), 1]), frame_level)
            v1 = v2.T

            # Vassilakis roughness terms: amplitude product, amplitude
            # fluctuation, and the Plomp dissonance weighting
            X = v1 * v2
            Y = (2 * v2) / (v1 + v2)
            Z = plomp(f1, f2)
            rough = (X ** 0.1) * (0.5 * (Y ** 3.11)) * Z

            allroughness.append(np.sum(rough))
        else:
            # fewer than two peaks: no pairs, so no roughness in this frame
            allroughness.append(0)

    mean_roughness = np.mean(allroughness)

    if dev_output:
        return [mean_roughness]
    else:
        '''
        Perform linear regression
        '''
        # cap roughness for low end
        if mean_roughness < 0.01:
            return 0
        else:
            # map the log roughness onto the 0-100 subjective scale
            roughness = np.log10(mean_roughness) * 13.98779569 + 48.97606571545886
            if clip_output:
                roughness = timbral_util.output_clip(roughness)

            return roughness
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
timbral_models/Timbral_Sharpness.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from . import timbral_util
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def sharpness_Fastl(loudspec):
    """
    Calculate sharpness from a specific-loudness spectrum, based on Fastl (1991).

    The expression for the weighting function g(z) was obtained by fitting an
    equation to data given in 'Psychoacoustics: Facts and Models' using the
    MATLAB basic fitting function.

    Original MATLAB code by Claire Churchill, Sep 2004.
    Transcoded by Andy Pearce, 2018.

    :param loudspec: array of specific loudness per tenth-Bark bin
                     (typically 240 bins covering 0.1-24 Bark).
    :return: float, sharpness value.
    """
    n = len(loudspec)

    # weighting g(z): unity up to z = 14 Bark (the first 140 tenth-Bark bins),
    # then a 4th-order polynomial fit above that.
    # Robustness fix: the original unconditionally built 140 unity weights,
    # which broke broadcasting for spectra shorter than 140 bins; truncating
    # the flat region to n is the natural extension (the weighting is unity
    # everywhere below 14 Bark anyway).
    gz = np.ones(min(n, 140))
    if n > 140:
        z = np.arange(141, n + 1)
        gzz = 0.00012 * (z/10.0) ** 4 - 0.0056 * (z/10.0) ** 3 + 0.1 * (z/10.0) ** 2 - 0.81 * (z/10.0) + 3.5
        gz = np.concatenate((gz, gzz))

    # Bark value of each bin centre (0.1, 0.2, ..., n/10)
    z = np.arange(0.1, n/10.0 + 0.1, 0.1)

    # weighted first moment of the specific loudness, normalised by total loudness
    sharp = 0.11 * np.sum(loudspec * gz * z * 0.1) / np.sum(loudspec * 0.1)
    return sharp
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def timbral_sharpness(fname, dev_output=False, phase_correction=False, clip_output=False, fs=0):
    """
    Calculate the apparent sharpness of an audio file.

    This is an implementation of the MATLAB sharpness function found at:
    https://www.salford.ac.uk/research/sirc/research-groups/acoustics/psychoacoustics/sound-quality-making-products-sound-better/accordion/sound-quality-testing/matlab-codes

    This version of timbral_sharpness contains self loudness normalising
    methods and can accept arrays as an input instead of a string filename.

    Version 0.4

    Originally coded by Claire Churchill, Sep 2004.
    Transcoded by Andy Pearce, 2018.

    Required parameter
    :param fname: string, audio filename to be analysed, including full file path and extension.

    Optional parameters
    :param dev_output: bool, when False return the sharpness, when True return all extracted features.
    :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum.
                             Defaults to False.
    :param clip_output: bool, force the output to be between 0 and 100.  Defaults to False.
    :param fs: int/float, required sample rate when fname is a numpy array.  Defaults to 0.

    :return: Apparent sharpness of the audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    # analyse the audio in 4096-sample windows
    frames = timbral_util.window_audio(audio_samples, window_length=4096)

    frame_sharpness = []
    frame_rms = []
    for samples in frames:
        # RMS level of the window, used as a weighting below
        frame_rms.append(np.sqrt(np.mean(samples * samples)))

        # specific loudness of the window
        N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0)

        # only windows that actually contain audio contribute a sharpness value
        frame_sharpness.append(sharpness_Fastl(N_single) if N_entire > 0 else 0)

    # convert to arrays for the weighted average
    rms = np.array(frame_rms)
    sharp = np.array(frame_sharpness)

    # energy-weighted mean sharpness, then log10 to better match subjective ratings
    rms_sharpness = np.log10(np.average(sharp, weights=(rms * rms)))

    if dev_output:
        return [rms_sharpness]

    # linear regression onto the 0-100 subjective scale
    coefficients = [102.50508921364404, 34.432655185001735]
    sharpness = rms_sharpness * coefficients[0] + coefficients[1]

    if clip_output:
        sharpness = timbral_util.output_clip(sharpness)

    return sharpness
|
timbral_models/Timbral_Warmth.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division
|
| 2 |
+
import numpy as np
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from scipy.signal import spectrogram
|
| 5 |
+
import scipy.stats
|
| 6 |
+
from sklearn import linear_model
|
| 7 |
+
from . import timbral_util
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def warm_region_cal(audio_samples, fs):
    """
    Compute warmth-related loudness features from an audio signal.

    :param audio_samples: numpy.array, one-dimensional array of audio samples.
    :param fs: int, sample rate of the audio file.

    :return: four values: mean warmth-region level, RMS-weighted warmth-region
             level, mean high-frequency level, RMS-weighted high-frequency level.
    """
    # split the audio into analysis windows
    frames = timbral_util.window_audio(audio_samples)

    # weighting curve emphasising the lower portion (bands 10 - 40) of the
    # Bark-scale specific-loudness spectrum
    lo_band = 10
    hi_band = 40
    centre_band = (lo_band + hi_band) / 2.0
    band_idx = np.arange(lo_band, hi_band)
    curve = timbral_util.normal_dist(band_idx, theta=0.01, mean=centre_band)
    curve -= np.min(curve)
    curve /= np.max(curve)

    wr_array = np.zeros(240)
    wr_array[lo_band:hi_band] = curve

    # second weighting curve emphasising the upper portion (bands 80 - 240)
    lo_band = 80
    hi_band = 240
    centre_band = (lo_band + hi_band) / 2.0
    band_idx = np.arange(lo_band, hi_band)
    curve = timbral_util.normal_dist(band_idx, theta=0.01, mean=centre_band)
    curve -= np.min(curve)
    curve /= np.max(curve)

    hf_array = np.zeros(240)
    hf_array[lo_band:hi_band] = curve

    windowed_loud_spec = []
    windowed_rms = []

    wr_vals = []
    hf_vals = []

    for frame in frames:
        # per-frame specific loudness spectrum
        N_entire, N_single = timbral_util.specific_loudness(frame, Pref=100.0, fs=fs, Mod=0)

        # store the loudness spectrum and the frame RMS level
        windowed_loud_spec.append(N_single)
        windowed_rms.append(np.sqrt(np.mean(frame * frame)))

        # weighted sums of the specific loudness under each curve
        wr_vals.append(np.sum(wr_array * N_single))
        hf_vals.append(np.sum(hf_array * N_single))

    # plain and RMS-weighted averages across frames
    mean_wr = np.mean(wr_vals)
    mean_hf = np.mean(hf_vals)
    weighted_wr = np.average(wr_vals, weights=windowed_rms)
    weighted_hf = np.average(hf_vals, weights=windowed_rms)

    return mean_wr, weighted_wr, mean_hf, weighted_hf
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def timbral_warmth(fname, dev_output=False, phase_correction=False, clip_output=False, max_FFT_frame_size=8192,
                   max_WR=12000, fs=0):
    """
    This function estimates the perceptual Warmth of an audio file.

    This model of timbral_warmth contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname:               string, audio filename to be analysed, including full file path and extension.

    Optional parameters
    :param dev_output:          bool, when False return the warmth, when True return all extracted features in a
                                list.
    :param phase_correction:    bool, if the inter-channel phase should be estimated when performing a mono sum.
                                Defaults to False.
    :param clip_output:         bool, clip the output to the 0-100 range.  Defaults to False.
    :param max_FFT_frame_size:  int, frame size for calculating the spectrogram, defaults to 8192.
    :param max_WR:              float, maximum allowable warmth region frequency, defaults to 12000.
    :param fs:                  int, sample rate when fname is an array; 0 means read it from the file.

    :return: estimated warmth of the audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    # get the mean warmth region level and the RMS-weighted high frequency content
    mean_wr, _, _, weighted_hf = warm_region_cal(audio_samples, fs)

    # envelope used for onset detection
    envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.1)

    # calculate the onsets
    nperseg = 4096
    original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg)
    # if no onsets were found, or only a single onset exists, analyse from the start of the file
    if not original_onsets or len(original_onsets) == 1:
        original_onsets = [0]

    '''
    Initialise lists for storing features
    '''
    all_rms = []
    all_ratio = []
    all_SC = []
    all_decay_score = []

    # calculate metrics for each onset
    for idx, onset in enumerate(original_onsets):
        if onset == original_onsets[-1]:
            # this is the last onset, so take the segment to the end of the file
            segment = audio_samples[onset:]
        else:
            segment = audio_samples[onset:original_onsets[idx + 1]]

        segment_rms = np.sqrt(np.mean(segment * segment))
        all_rms.append(segment_rms)

        # get FFT of the segment; short segments are zero-padded up to max_FFT_frame_size
        segment_length = len(segment)
        freq, time, spec = spectrogram(segment, fs,
                                       nperseg=min(segment_length, max_FFT_frame_size),
                                       nfft=max_FFT_frame_size)

        # collapse the spectrogram to a single spectrum.  Catches some strange errors that cause crashes
        if spec.shape[1] > 1:
            spec = np.sum(spec, axis=1)
        spec = spec.flatten()

        # normalise the spectrum for this onset
        spec /= max(abs(spec))

        '''
        Estimate of fundamental frequency
        '''
        # peak picking algorithm
        peak_idx, peak_value, peak_x = timbral_util.detect_peaks(spec, freq=freq, fs=fs)
        # the lowest spectral peak is taken as the fundamental
        fundamental = np.min(peak_x)
        fundamental_idx = np.min(peak_idx)

        '''
        Warmth region calculation
        '''
        # the warmth region extends to 3.5x the fundamental, capped at max_WR.
        # Fix: the cap previously used a hard-coded 12000, ignoring the max_WR parameter.
        WR_upper_f_limit = fundamental * 3.5
        if WR_upper_f_limit > max_WR:
            WR_upper_f_limit = max_WR
        WR_upper_f_limit_idx = int(np.where(freq > WR_upper_f_limit)[0][0])

        if fundamental < 260:
            # find the frequency bin closest to 260 Hz
            top_level_idx = int(np.where(freq > 260)[0][0])
            # energy between the fundamental and 260 Hz
            low_energy = np.sum(spec[fundamental_idx:top_level_idx])
            # ratio of low energy to total spectral energy
            ratio = low_energy / float(np.sum(spec))
        else:
            # exception where the fundamental lies above the low-energy band
            ratio = 0

        all_ratio.append(ratio)

        '''
        Spectral centroid of the segment
        '''
        SC = np.sum(freq * spec) / float(np.sum(spec))
        all_SC.append(SC)

        '''
        HF decay
        - linear regression of the values above the warmth region
        '''
        above_WR_spec = np.log10(spec[WR_upper_f_limit_idx:])
        above_WR_freq = np.log10(freq[WR_upper_f_limit_idx:])
        # slope column plus a ones column; the model's own intercept is disabled
        metrics = np.array([above_WR_freq, np.ones_like(above_WR_freq)])

        model = linear_model.LinearRegression(fit_intercept=False)
        model.fit(metrics.transpose(), above_WR_spec)
        decay_score = model.score(metrics.transpose(), above_WR_spec)
        all_decay_score.append(decay_score)

    '''
    Get mean values
    '''
    mean_SC = np.log10(np.mean(all_SC))
    mean_decay_score = np.mean(all_decay_score)
    weighted_mean_ratio = np.average(all_ratio, weights=all_rms)

    if dev_output:
        return mean_SC, weighted_hf, mean_wr, mean_decay_score, weighted_mean_ratio
    else:
        '''
        Apply regression model
        '''
        # the last element stays at 1.0 and acts as the intercept term
        all_metrics = np.ones(6)
        all_metrics[0] = mean_SC
        all_metrics[1] = weighted_hf
        all_metrics[2] = mean_wr
        all_metrics[3] = mean_decay_score
        all_metrics[4] = weighted_mean_ratio

        # coefficients from linear regression
        coefficients = np.array([-4.464258317026696,
                                 -0.08819320850778556,
                                 0.29156539973575546,
                                 17.274733561081554,
                                 8.403340066029507,
                                 45.21212125085579])

        warmth = np.sum(all_metrics * coefficients)

        # clip output between 0 and 100
        if clip_output:
            warmth = timbral_util.output_clip(warmth)

        return warmth
|
timbral_models/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__version__ = '0.4.1'
|
| 2 |
+
|
| 3 |
+
from .Timbral_Brightness import timbral_brightness
|
| 4 |
+
from .Timbral_Depth import timbral_depth
|
| 5 |
+
from .Timbral_Hardness import timbral_hardness
|
| 6 |
+
from .Timbral_Roughness import timbral_roughness
|
| 7 |
+
from .Timbral_Warmth import timbral_warmth
|
| 8 |
+
from .Timbral_Sharpness import timbral_sharpness
|
| 9 |
+
from .Timbral_Booming import timbral_booming
|
| 10 |
+
from .Timbral_Reverb import timbral_reverb
|
| 11 |
+
from .Timbral_Extractor import timbral_extractor
|
| 12 |
+
from .timbral_util import *
|
timbral_models/__pycache__/Timbral_Booming.cpython-310.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Brightness.cpython-310.pyc
ADDED
|
Binary file (4.98 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Depth.cpython-310.pyc
ADDED
|
Binary file (6.56 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Extractor.cpython-310.pyc
ADDED
|
Binary file (3.71 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Hardness.cpython-310.pyc
ADDED
|
Binary file (5.98 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Reverb.cpython-310.pyc
ADDED
|
Binary file (8.36 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Roughness.cpython-310.pyc
ADDED
|
Binary file (4.55 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Sharpness.cpython-310.pyc
ADDED
|
Binary file (3.84 kB). View file
|
|
|
timbral_models/__pycache__/Timbral_Warmth.cpython-310.pyc
ADDED
|
Binary file (5.86 kB). View file
|
|
|
timbral_models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (702 Bytes). View file
|
|
|
timbral_models/__pycache__/timbral_util.cpython-310.pyc
ADDED
|
Binary file (41.7 kB). View file
|
|
|
timbral_models/timbral_util.py
ADDED
|
@@ -0,0 +1,1816 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import division, print_function
|
| 2 |
+
import numpy as np
|
| 3 |
+
import librosa
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
from scipy.signal import butter, lfilter, spectrogram
|
| 6 |
+
import scipy.stats
|
| 7 |
+
import pyloudnorm as pyln
|
| 8 |
+
import six
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
The timbral util is a collection of functions that can be accessed by the individual timbral models. These can be
|
| 12 |
+
used for extracting features or manipulating the audio that are useful to multiple attributes.
|
| 13 |
+
|
| 14 |
+
Version 0.4
|
| 15 |
+
|
| 16 |
+
Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.
|
| 17 |
+
|
| 18 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 19 |
+
you may not use this file except in compliance with the License.
|
| 20 |
+
You may obtain a copy of the License at
|
| 21 |
+
|
| 22 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 23 |
+
|
| 24 |
+
Unless required by applicable law or agreed to in writing, software
|
| 25 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 26 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 27 |
+
See the License for the specific language governing permissions and
|
| 28 |
+
limitations under the License.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def db2mag(dB):
    """
    Convert a level in decibels to its equivalent linear magnitude.

    :param dB: level in decibels to be converted.
    :return: linear magnitude corresponding to the dB input.
    """
    exponent = dB / 20.0
    return 10 ** exponent
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_percussive_audio(audio_samples, return_ratio=True):
    """
    Get the percussive component of the audio file via librosa's
    harmonic/percussive source separation (HPSS) with default settings.
    Future updates may change the defaults for better separation or to improve
    the correlation to subjective data.

    :param audio_samples: the audio samples to be harmonically/percussively separated.
    :param return_ratio:  determines the value returned by the function.

    :return: if return_ratio is True (default), the RMS-weighted ratio of
             percussive energy to total energy is returned.
             If False, the function returns the percussive audio as a
             time-domain array.
             NOTE(review): when return_ratio is True but every analysis frame
             has zero energy, no explicit return is reached and the function
             yields None — presumably unreachable for real audio; confirm.
    """
    # use librosa decomposition (STFT -> harmonic/percussive masks)
    D = librosa.core.stft(audio_samples)
    H, P = librosa.decompose.hpss(D)

    # inverse transform to get time domain arrays
    percussive_audio = librosa.core.istft(P)
    harmonic_audio = librosa.core.istft(H)

    if return_ratio:
        # frame by frame RMS energy (helper defined elsewhere in this module)
        percussive_energy = calculate_rms_enveope(percussive_audio, step_size=1024, overlap_step=512, normalise=False)
        harmonic_energy = calculate_rms_enveope(harmonic_audio, step_size=1024, overlap_step=512, normalise=False)

        # set defaults for storing the data
        ratio = []
        t_power = []

        # get the ratio for each RMS time frame; frames with no energy at all
        # are skipped so the ratio denominator is never zero
        for i in range(len(percussive_energy)):
            if percussive_energy[i] != 0 or harmonic_energy[i] != 0:
                # if percussive_energy[i] != 0 and harmonic_energy[i] != 0:
                ratio.append(percussive_energy[i] / (percussive_energy[i] + harmonic_energy[i]))
                t_power.append((percussive_energy[i] + harmonic_energy[i]))

        if t_power:
            # take an average of the per-frame ratios, weighted by frame power
            ratio = np.average(ratio, weights=t_power)
            return ratio
    else:
        # return the percussive audio when return_ratio is False
        return percussive_audio
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def filter_audio_highpass(audio_samples, crossover, fs, order=2):
    """ Design and apply a Butterworth high-pass filter whose -3 dB point is
    at the crossover frequency.

    :param audio_samples: data to be filtered as an array.
    :param crossover: the crossover frequency of the filter in Hz.
    :param fs: the sampling frequency of the audio file.
    :param order: order of the filter, defaults to 2.

    :return: the filtered array.
    """
    # normalise the cutoff relative to the Nyquist frequency
    normalised_cutoff = crossover / (0.5 * fs)
    b, a = butter(order, normalised_cutoff, 'high')
    return lfilter(b, a, audio_samples)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def filter_audio_lowpass(audio_samples, crossover, fs, order=2):
    """ Design and apply a Butterworth low-pass filter whose -3 dB point is
    at the crossover frequency.

    :param audio_samples: data to be filtered as an array.
    :param crossover: the crossover frequency of the filter in Hz.
    :param fs: the sampling frequency of the audio file.
    :param order: order of the filter, defaults to 2.

    :return: the filtered array.
    """
    # normalise the cutoff relative to the Nyquist frequency
    normalised_cutoff = crossover / (0.5 * fs)
    b, a = butter(order, normalised_cutoff, 'low')
    return lfilter(b, a, audio_samples)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def butter_bandpass(lowcut, highcut, fs, order=2):
    """ Design a Butterworth band-pass filter.

    :param lowcut: lower band edge in Hz.
    :param highcut: upper band edge in Hz.
    :param fs: sampling frequency of the audio.
    :param order: filter order, defaults to 2.

    :return: the (b, a) filter coefficients.
    """
    # band edges normalised to the Nyquist frequency
    nyq = fs * 0.5
    return butter(order, [lowcut / nyq, highcut / nyq], btype='band')
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def filter_audio_bandpass(audio_samples, f0, noct, fs, order=2):
    """ Design and apply an n/octave Butterworth band-pass filter centred at f0 Hz.

    :param audio_samples: the audio file as an array.
    :param f0: the centre frequency of the band-pass filter in Hz.
    :param noct: fraction-of-octave width of the band (e.g. 3 for 1/3 octave).
    :param fs: the sampling frequency of the audio file.
    :param order: order of the filter, defaults to 2.

    :return: the filtered audio.
    """
    # half-bandwidth multiplier for a 1/noct-octave band around f0
    edge_factor = 2 ** (1.0 / (noct * 2))
    b, a = butter_bandpass(f0 / edge_factor, f0 * edge_factor, fs, order=order)
    return lfilter(b, a, audio_samples)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def return_loop(onset_loc, envelope, function_time_thresh, hist_threshold, hist_time_samples, nperseg=512):
    """ This function is used by the calculate_onsets method.

    It looks backwards in time from the estimated attack time and attempts to find the exact
    onset point by identifying the point backwards in time where the envelope no longer falls.
    A hysteresis is included to account for small deviations in the attack due to the
    envelope calculation.

    The function looks ``function_time_thresh`` samples (10 ms in the timbral_hardness code)
    backwards from ``onset_loc`` for any sample lower than the current one, repeating from
    each new minimum until none is found.  It then looks backwards over
    ``hist_time_samples`` (200 ms), checking whether the rise exceeds ``hist_threshold``
    (10% of the envelope's dynamic range) to decide if an earlier event was found.

    :param onset_loc:            onset location estimated by librosa (time-domain sample index).
    :param envelope:             envelope of the audio file.
    :param function_time_thresh: look-back window, in samples (10 ms in the calling code).
    :param hist_threshold:       level threshold for the 200 ms hysteresis check.
    :param hist_time_samples:    hysteresis look-back window, in samples (200 ms).
    :param nperseg:              minimum allowed onset index; indices at or below this return 0.

    :return: corrected onset index, or 0 if the onset is too close to the start of the envelope.
    """

    # define flag for exiting while loop
    # NOTE: found_start is never set True; the loop only exits via the return statements below.
    found_start = False

    while not found_start:
        # get the current sample value
        current_sample = envelope[int(onset_loc)]
        # get the previous 10ms worth of samples
        if onset_loc - function_time_thresh > 0:
            evaluation_array = envelope[onset_loc - function_time_thresh - 1:onset_loc]
        else:
            # near the start of the envelope: evaluate everything before the onset
            evaluation_array = envelope[:onset_loc - 1]

        if min(evaluation_array) - current_sample <= 0:
            '''
            If the minimum value within previous 10ms is less than current sample,
            move to the start position to the minimum value and look again.
            '''
            min_idx = np.argmin(evaluation_array)
            # NOTE(review): this offset assumes evaluation_array started at
            # onset_loc - function_time_thresh - 1; when the short-array branch above
            # was taken the window actually started at 0 — confirm intended behaviour.
            new_onset_loc = min_idx + onset_loc - function_time_thresh - 1

            if new_onset_loc > nperseg:
                onset_loc = new_onset_loc
            else:
                ''' Current index is close to start of the envelope, so exit with the idx as 512 '''
                return 0

        else:
            '''
            If the minimum value within previous 10ms is greater than current sample,
            introduce the time and level hysteresis to check again.
            '''
            # get the array of 200ms previous to the current onset idx
            if (onset_loc - hist_time_samples - 1) > 0:
                hyst_evaluation_array = envelope[onset_loc - hist_time_samples - 1:onset_loc]
            else:
                hyst_evaluation_array = envelope[:onset_loc]

            # values less than current sample
            all_match = np.where(hyst_evaluation_array < envelope[onset_loc])

            # if no minimum was found within the extended time, exit with current onset idx
            if len(all_match[0]) == 0:
                return onset_loc

            # get the idx of the closest value which is lower than the current onset idx
            last_min = all_match[0][-1]
            last_idx = int(onset_loc - len(hyst_evaluation_array) + last_min)

            # get the dynamic range of this segment
            segment_dynamic_range = max(hyst_evaluation_array[last_min:]) - min(hyst_evaluation_array[last_min:])

            # compare this dynamic range against the hysteresis threshold
            if segment_dynamic_range >= hist_threshold:
                '''
                The dynamic range is greater than the threshold, therefore this is a separate audio event.
                Return the current onset idx.
                '''
                return onset_loc
            else:
                '''
                The dynamic range is less than the threshold, therefore this is not a separate audio event.
                Set current onset idx to minimum value and repeat.
                '''
                if last_idx >= nperseg:
                    onset_loc = last_idx
                else:
                    '''
                    The hysteresis check puts the new threshold too close to the start
                    '''
                    return 0
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.2, hold_time=0.01):
    """
    Calculate the envelope of audio_samples with a 'sample and hold' style function.

    This avoids the minimum attack time being limited by low-pass filtering, a common
    alternative way of obtaining the envelope.

    :param audio_samples: audio array.
    :param fs: sampling frequency.
    :param decay_time: decay time (in seconds) applied after the peak hold expires.
    :param hold_time: hold time (in seconds) applied before a decay begins.

    :return: envelope of audio_samples as a numpy array.
    """
    # full-wave rectify the signal
    rectified = abs(audio_samples)

    # decay step is scaled to the signal's peak so the release slope is level-independent
    decay_step = max(rectified) / (decay_time * fs)
    # number of samples to hold the last peak before decaying
    hold_limit = hold_time * fs

    env_values = []
    held_level = 0.0
    samples_held = 0

    for level in rectified:
        if level >= held_level:
            # rising (or flat): track the input directly and restart the hold window
            held_level = level
            samples_held = 0
        elif samples_held < hold_limit:
            # falling but still inside the hold window: keep the held peak
            samples_held += 1
        else:
            # hold expired: decay linearly, but never drop below the input itself
            decayed = held_level - decay_step
            held_level = decayed if decayed > level else level
        env_values.append(held_level)

    return np.array(env_values)
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def get_spectral_features(audio, fs, lf_limit=20, scale='hz', cref=27.5, power=2, window_type='none',
                          rollon_thresh=0.05):
    """
    Calculate the spectral centroid, spectral spread, and unitless centroid of an audio array.

    :param audio: audio array.
    :param fs: sample rate of the audio file.
    :param lf_limit: low frequency limit, in Hz, below which spectrum bins are ignored.
                     Defaults to 20 Hz.
    :param scale: frequency scale used for the calculation: 'hz' (linear, the default),
                  'mel', 'erb', or 'cents'.
    :param cref: reference frequency for the 'cents' scale. Defaults to 27.5 Hz.
    :param power: retained for backward compatibility; the alternative (librosa-style)
                  spread calculation that used it is currently disabled.
    :param window_type: analysis window, either 'hann' or 'none'. Defaults to 'none'.
    :param rollon_thresh: fraction of total spectral energy that defines the spectral
                          roll-on frequency. Defaults to 0.05.

    :return: (centroid, spread, unitless_centroid), or 0 if the spectrum has no energy.

    :raises ValueError: if window_type or scale is not one of the recognised options.
    """
    if window_type == 'hann':
        window = np.hanning(len(audio))
    elif window_type == 'none':
        window = np.ones(len(audio))
    else:
        raise ValueError('Window type must be set to either \'hann\' or \'none\'')

    # zero-pad to the next power of two for the FFT
    next_pow_2 = int(pow(2, np.ceil(np.log2(len(window)))))
    spectrum = np.fft.fft((window * audio), next_pow_2)
    # keep the single-sided magnitude spectrum
    spectrum = np.absolute(spectrum[0:int(len(spectrum) / 2) + 1])

    tpower = np.sum(spectrum)
    if tpower <= 0:
        # silent input: no spectral statistics can be computed
        return 0

    freq = np.arange(0, len(spectrum), 1) * (fs / (2.0 * (len(spectrum) - 1)))

    # discard bins below the low-frequency limit (zeros used to unpack result)
    lf_limit_idx = np.where(freq >= lf_limit)[0][0]
    spectrum = spectrum[lf_limit_idx:]
    freq = freq[lf_limit_idx:]

    # convert frequency axis to the requested scale
    if scale == 'hz':
        pass  # already a linear Hz axis
    elif scale == 'mel':
        freq = 1127.0 * np.log(1 + (freq / 700.0))
    elif scale == 'erb':
        freq = 21.4 * np.log10(1 + (0.00437 * freq))
    elif scale == 'cents':
        # bug fix: this branch previously tested `freq == 'cents'`, comparing the
        # frequency *array* to a string, so requesting the 'cents' scale always
        # fell through to the ValueError below.
        freq = 1200.0 * np.log2((freq / cref) + 1.0)
    else:
        raise ValueError('Frequency scale type not recognised. Please use \'hz\', \'mel\', \'erb\', or \'cents\'.')

    # spectral centroid: amplitude-weighted mean frequency
    centroid = sum(spectrum * freq) / float(sum(spectrum))

    # spectral spread: amplitude-weighted standard deviation about the centroid
    deviation = np.abs(freq - centroid)
    spread = np.sqrt(np.sum((deviation ** 2) * spectrum) / np.sum(spectrum))

    # spectral roll-on: lowest frequency below which rollon_thresh of the energy lies
    cumulative_spectral_power = spectrum[0]
    counter = 0
    rollon_threshold = np.sum(spectrum) * rollon_thresh
    while cumulative_spectral_power < rollon_threshold:
        counter += 1
        cumulative_spectral_power = np.sum(spectrum[:counter])

    if counter == 0:
        # first bin already exceeds the threshold; avoid dividing by the DC bin
        counter = 1

    rollon_frequency = freq[counter]
    unitless_centroid = centroid / rollon_frequency

    return centroid, spread, unitless_centroid
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def calculate_attack_time(envelope_samples, fs, calculate_attack_segment=True, thresh_no=8, normalise=True, m=3,
                          calculation_type='min_effort', gradient_calulation_type='all', return_descriptive_data=False,
                          max_attack_time=-1):
    """
    Calculate the attack time from the envelope of a signal.

    Required inputs
    :param envelope_samples:            envelope of the audio file, suggested to be calculated with
                                        sample_and_hold_envelope_calculation.
    :param fs:                          sample rate of the envelope_samples.

    Optional inputs
    :param calculate_attack_segment:    if True, isolate the attack segment (pre-peak minimum to peak)
                                        before estimating the attack time. Defaults to True.
    :param thresh_no:                   number of thresholds used for the minimum-effort method.
                                        Defaults to 8.
    :param normalise:                   normalise the attack segment before analysis. Defaults to True.
                                        NOTE(review): normalisation uses `/=`, mutating the caller's
                                        array in place when a numpy array is passed — confirm intended.
    :param m:                           multiplier for the minimum-effort thresholds, defaults to 3 as
                                        suggested in the CUIDADO project.
    :param calculation_type:            method for calculating the attack time, either 'min_effort' or
                                        'fixed_threshold'. Defaults to 'min_effort'.
    :param gradient_calulation_type:    method for calculating the attack gradient: 'all' (from the
                                        estimated start and end points) or 'mean' (mean gradient between
                                        threshold steps of the minimum-effort method). Defaults to 'all'
                                        and reverts to 'all' when 'mean' is not usable.
    :param return_descriptive_data:     if True, also return the thresholds used by the calculation.
                                        Defaults to False.
    :param max_attack_time:             maximum allowable attack time in seconds; -1 (default) means
                                        no maximum.

    :return: (attack_time [log10 seconds], attack_gradient, attack start index, temporal centroid),
             plus the threshold data if return_descriptive_data is True.
             Returns the scalar 0 if the envelope peaks at its first sample.

    :raises ValueError: if calculation_type is not 'min_effort' or 'fixed_threshold'.
    """
    if normalise:
        # normalise the segments to a 0..1 peak
        normalise_factor = float(max(envelope_samples))
        envelope_samples /= normalise_factor

    if calculate_attack_segment:
        # identify pre-attack segment
        peak_idx = np.argmax(envelope_samples)
        if peak_idx == 0:
            # no rising segment exists before the peak; exit on error
            return 0
        # last index of the pre-peak minimum (np.where keeps the final occurrence)
        min_pre_peak_idx = np.where(envelope_samples[:peak_idx] == min(envelope_samples[:peak_idx]))[-1][-1]

        # redefine the envelope samples as just the min to the peak
        envelope_samples = envelope_samples[min_pre_peak_idx:peak_idx + 1]
    else:
        min_pre_peak_idx = 0

    # calculate the appropriate start and end of the attack using the selected method
    if calculation_type == 'min_effort':
        # get threshold time array; +2 is to ignore the 0 and 100% levels.
        threshold_step = 1.0 / (thresh_no + 2)
        dyn_range = max(envelope_samples) - min(envelope_samples)
        thresh_level = np.linspace(threshold_step, (1 - threshold_step), thresh_no + 1)
        thresh_level = (thresh_level * dyn_range) + min(envelope_samples)

        # predefine an array for when each threshold is crossed
        threshold_idxs = np.zeros(thresh_no + 1)

        # get indexes for when each threshold is first crossed
        for j in range(len(thresh_level)):
            threshold_hold = np.argmax(envelope_samples >= thresh_level[j])
            threshold_idxs[j] = threshold_hold

        # calculate effort values (sample distances between threshold crossings)
        effort = np.diff(threshold_idxs)

        # get the mean effort value and the derived effort threshold
        effort_mean = np.mean(effort)
        effort_threshold = effort_mean * m

        # find start time for the attack: first effort below the threshold
        th_start = np.argmax(effort <= effort_threshold)

        # need to use remaining effort values to find the end
        effort_hold = effort[th_start:]
        th_end = np.argmax(effort_hold >= effort_threshold)  # this returns a 0 if value not found
        if th_end == 0:
            th_end = len(effort_hold) - 1  # make equal to the last value

        # apply correction for holding the values
        th_end = th_end + th_start

        # get the actual start and stop index into the envelope
        th_start_idx = threshold_idxs[th_start]
        th_end_idx = threshold_idxs[th_end]

        # degenerate case: fall back to the outermost thresholds
        if th_start_idx == th_end_idx:
            th_start_idx = threshold_idxs[0]
            th_end_idx = threshold_idxs[-1]

        if th_start_idx == th_end_idx:
            # still degenerate: attack is a single sample
            attack_time = 1.0 / fs
        else:
            attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        # clamp to the maximum permissible attack time, if one was set
        if max_attack_time > 0:
            if attack_time > max_attack_time:
                # how many samples is equivalent to the maximum?
                max_attack_time_sample = int(fs * max_attack_time)  # convert to integer
                th_end_idx = th_start_idx + max_attack_time_sample
                attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        start_level = envelope_samples[int(th_start_idx)]
        end_level = envelope_samples[int(th_end_idx)]

        # specify exceptions for a step function crossing both thresholds
        if start_level == end_level:
            if th_start_idx > 0:
                # if a previous sample is available, take the previous starting sample
                start_level = envelope_samples[int(th_start_idx) - 1]
            else:
                # set start level to zero if onset is at the first sample (a step function at time zero)
                start_level = 0.0

        # is there enough data to calculate the mean?
        if gradient_calulation_type == 'mean':
            if (end_level - start_level) < 0.2 or (th_end_idx - th_start_idx) < 2:
                # force calculation type to all
                gradient_calulation_type = 'all'
                print('unable to calculate attack gradient with the \'mean\' method, reverting to \'all\' method.')

        if gradient_calulation_type == 'mean':
            # calculate the gradient based on the weighted mean of each attack step
            threshold_step = dyn_range / (thresh_no + 2)

            gradient_thresh_array = np.arange(start_level, end_level + (threshold_step * dyn_range),
                                              (threshold_step * dyn_range))
            cross_threshold_times = np.zeros(len(gradient_thresh_array))
            cross_threshold_values = np.zeros(len(gradient_thresh_array))
            # NOTE(review): th_start_idx/th_end_idx are floats from threshold_idxs here;
            # slicing with them (and `hold[0]` on the scalar returned by np.argmax below)
            # looks like it would raise — confirm whether this 'mean' path is exercised.
            gradient_envelope_segment = envelope_samples[th_start_idx:th_end_idx + 1]

            for i in range(len(cross_threshold_values)):
                hold = np.argmax(gradient_envelope_segment >= gradient_thresh_array[i])
                cross_threshold_times[i] = hold[0] / float(fs)
                cross_threshold_values[i] = gradient_envelope_segment[hold[0]]

            # per-step gradients between adjacent threshold crossings
            pente_v = np.diff(cross_threshold_values) / np.diff(cross_threshold_times)

            # calculate weighted average of all gradients with a gaussian distribution
            m_threshold = 0.5 * (gradient_thresh_array[:-1] + gradient_thresh_array[1:])
            weight_v = np.exp(-(m_threshold - 0.5) ** 2 / (0.5 ** 2))

            attack_gradient = np.sum(pente_v * weight_v) / np.sum(weight_v)

        elif gradient_calulation_type == 'all':
            # calculate the attack gradient from th_start_idx to th_end_idx
            attack_gradient = (end_level - start_level) / attack_time

        '''
        More stuff to return if we want extra information to be displayed
        '''
        thresholds_to_return = [calculation_type, th_start_idx + min_pre_peak_idx, th_end_idx + min_pre_peak_idx,
                                threshold_idxs + min_pre_peak_idx]

    elif calculation_type == 'fixed_threshold':
        # set threshold values (percent of dynamic range) for fixed threshold method
        fixed_threshold_start = 20
        fixed_threshold_end = 90

        # get dynamic range
        dyn_range = max(envelope_samples) - min(envelope_samples)

        # get thresholds relative to envelope level
        lower_threshold = (fixed_threshold_start * dyn_range * 0.01) + min(envelope_samples)
        upper_threshold = (fixed_threshold_end * dyn_range * 0.01) + min(envelope_samples)

        # calculate start index: first sample at or above the lower threshold
        th_start_idx = np.argmax(envelope_samples >= lower_threshold)
        # th_start_idx = th_start_idx[0]

        # find the end idx after the start idx
        th_end_idx = np.argmax(envelope_samples[th_start_idx:] >= upper_threshold)
        th_end_idx = th_end_idx + th_start_idx

        if th_start_idx == th_end_idx:
            # degenerate case: attack is a single sample
            attack_time = 1.0 / fs
        else:
            attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        # compare attack time to maximum permissible attack time
        if max_attack_time > 0:
            if attack_time > max_attack_time:
                # how many samples is equivalent to the maximum?
                max_attack_time_sample = int(fs * max_attack_time)  # convert to integer
                th_end_idx = th_start_idx + max_attack_time_sample
                attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        # calculate the gradient

        # find the level of the first sample used
        start_level = envelope_samples[int(th_start_idx)]
        # find the level of the last sample used
        end_level = envelope_samples[int(th_end_idx)]

        # specify exceptions for a step function crossing both thresholds
        if start_level == end_level:
            if th_start_idx > 0:
                # if a previous sample is available, take the previous starting sample
                start_level = envelope_samples[int(th_start_idx) - 1]
            else:
                # set start level to zero if onset is at the first sample (a step function at time zero)
                start_level = 0.0

        attack_gradient = (end_level - start_level) / attack_time

        '''
        More details to be returned if desired
        '''
        thresholds_to_return = [calculation_type, th_start_idx + min_pre_peak_idx, th_end_idx + min_pre_peak_idx]

    else:
        raise ValueError('calculation_type must be set to either \'fixed_threshold\' or \'min_effort\'.')

    # convert attack time to logarithmic scale
    attack_time = np.log10(attack_time)

    # revert attack gradient metric if envelope has been normalised
    if normalise:
        attack_gradient *= normalise_factor

    '''
    Calculate the temporal centroid
    '''
    hold_env = envelope_samples[int(th_start_idx):int(th_end_idx) + 1]
    t = np.arange(0, len(hold_env)) / float(fs)
    temp_centroid = np.sum(t * hold_env) / np.sum(hold_env)
    # normalised by segment length, giving a fractional position within the attack
    temp_centroid /= float(len(hold_env))

    if return_descriptive_data:
        return attack_time, attack_gradient, int(th_start_idx + min_pre_peak_idx), temp_centroid, thresholds_to_return
    else:
        return attack_time, attack_gradient, int(th_start_idx + min_pre_peak_idx), temp_centroid
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
def calculate_onsets(audio_samples, envelope_samples, fs, look_back_time=20, hysteresis_time=300, hysteresis_percent=10,
                     onset_in_noise_threshold=10, minimum_onset_time_separation=100, nperseg=512):
    """
    Calculate onset times using a look-backwards recursive function (return_loop) to identify
    actual note onsets, and weight the outputs based on the librosa onset strength to avoid
    misidentifying onsets.

    Required inputs
    :param audio_samples:                   the audio file in the time domain.
    :param envelope_samples:                the envelope of the audio file, suggested to be calculated
                                            with sample_and_hold_envelope_calculation.
    :param fs:                              sample rate of the audio file. The function assumes the same
                                            sample rate for both audio_samples and envelope_samples.

    Optional inputs
    :param look_back_time:                  time in ms to recursively look backwards to identify the
                                            start of an onset; defaults to 20 ms.
    :param hysteresis_time:                 time in ms to look backwards for the hysteresis check;
                                            defaults to 300 ms.
    :param hysteresis_percent:              percentage of the envelope's dynamic range used as the
                                            hysteresis level; defaults to 10%.
    :param onset_in_noise_threshold:        percentage of dynamic range below which an onset segment is
                                            treated as noise and discarded; defaults to 10%.
    :param minimum_onset_time_separation:   minimum time in ms by which two onsets may be separated;
                                            the earlier onset is kept. Defaults to 100 ms.
    :param nperseg:                         passed through to return_loop as the minimum valid index.

    :return: sorted list of thresholded onset indices; returns [0] if no onsets are identified.
             Note that a value of [0] is also possible during normal operation.
    """
    # get onsets with librosa estimation (sample-accurate, backtracked)
    onsets = librosa.onset.onset_detect(y=audio_samples, sr=fs, backtrack=True, units='samples')

    # set values for return_loop method
    time_thresh = int(look_back_time * 0.001 * fs)  # look-back time converted to samples
    hysteresis_samples = int(hysteresis_time * fs * 0.001)  # hysteresis time, in samples
    envelope_dyn_range = max(envelope_samples) - min(envelope_samples)
    hysteresis_thresh = envelope_dyn_range * hysteresis_percent * 0.01

    # only conduct analysis if there are onsets detected
    if np.size(onsets):
        # empty array for storing exact onset idxs
        corrected_onsets = []

        for onset_idx in onsets:
            # if the onset is 1 or 0, it's too close to the start to be corrected (1 is here due to zero padding)
            if onset_idx > 0:
                # actual onset location in samples (librosa uses 512 window size by default)
                onset_loc = np.array(onset_idx).astype('int')

                # only calculate if the onset is NOT at the end of the file, whilst other onsets exist.
                # If the only onset is at the end, calculate anyway.
                if not corrected_onsets:
                    onset_hold = return_loop(onset_loc, envelope_samples, time_thresh, hysteresis_thresh,
                                             hysteresis_samples, nperseg=nperseg)
                    corrected_onsets.append(onset_hold)
                else:
                    if (onset_loc + 511) < len(envelope_samples):
                        onset_hold = return_loop(onset_loc, envelope_samples, time_thresh, hysteresis_thresh,
                                                 hysteresis_samples, nperseg=nperseg)
                        corrected_onsets.append(onset_hold)
                    else:
                        corrected_onsets.append(0)

        # zero is returned from return_loop if no valid onset identified
        # remove zeros (except the first)
        zero_loc = np.where(np.array(corrected_onsets) == 0)[0]
        # ignore if the first value is zero
        if list(zero_loc):
            if zero_loc[0] == 0:
                zero_loc = zero_loc[1:]
            corrected_onsets = np.delete(corrected_onsets, zero_loc)

        # remove duplicates while preserving order
        hold_onsets = []
        for i in corrected_onsets:
            if i not in hold_onsets:
                hold_onsets.append(i)
        corrected_onsets = hold_onsets

        '''
        Remove repeated onsets and compare onset segments against the dynamic range
        to remove erroneous onsets in noise. If the onset segment (samples between
        adjacent onsets) has a dynamic range less than 10% of total dynamic range,
        remove this onset.
        '''
        if len(corrected_onsets) > 1:
            thd_corrected_onsets = []
            last_value = corrected_onsets[-1]
            threshold = onset_in_noise_threshold * envelope_dyn_range * 0.01

            # iterate backwards so deletions don't shift indices still to be visited
            for i in reversed(range(len(corrected_onsets))):
                if corrected_onsets[i] == corrected_onsets[-1]:
                    segment = envelope_samples[corrected_onsets[i]:]
                else:
                    segment = envelope_samples[corrected_onsets[i]:corrected_onsets[i + 1]]

                # only conduct if the segment is greater than 1 sample long
                if len(segment) > 1:
                    # find attack portion SNR
                    peak_idx = np.argmax(segment)
                    if peak_idx > 0:
                        # get the dynamic range of the attack portion
                        seg_dyn_range = max(segment) - min(segment[:peak_idx])
                        if seg_dyn_range >= threshold:
                            pass
                        else:
                            corrected_onsets = np.delete(corrected_onsets, i)
                    else:
                        corrected_onsets = np.delete(corrected_onsets, i)
                else:
                    corrected_onsets = np.delete(corrected_onsets, i)

        # remove onsets that are too close together, favouring the earlier onset
        if len(corrected_onsets) > 1:
            minimum_onset_time_separation_samples = fs * 0.001 * minimum_onset_time_separation
            time_separation = np.diff(corrected_onsets)
            # while loop for potential multiple iterations
            while len(corrected_onsets) > 1 and min(time_separation) < minimum_onset_time_separation_samples:
                onsets_to_remove = []
                # some onsets are closer together than the minimum value
                for i in range(len(corrected_onsets)-1):
                    # are these two onsets too close?
                    if abs(corrected_onsets[i+1] - corrected_onsets[i]) < minimum_onset_time_separation_samples:
                        onsets_to_remove.append(i+1)

                # remove onsets too close together
                corrected_onsets = np.delete(corrected_onsets, onsets_to_remove)
                time_separation = np.diff(corrected_onsets)

        '''
        Correct onsets by comparing to the onset strength.

        If there is an onset strength of 3 or greater between two onsets, then the onset is valid.
        Otherwise, discard the onset.
        '''
        thd_corrected_onsets = []

        # get the onset strength
        onset_strength = librosa.onset.onset_strength(y=audio_samples, sr=fs)

        # map sample indices to onset-strength frames (librosa hop length of 512)
        strength_onset_times = np.array(np.array(corrected_onsets) / 512).astype('int')
        # NOTE(review): clip() is not in-place here; its return value is discarded — confirm intended.
        strength_onset_times.clip(min=0)

        corrected_original_onsets = []
        corrected_strength_onsets = []
        for onset_idx in reversed(range(len(corrected_onsets))):
            current_strength_onset = strength_onset_times[onset_idx]
            if current_strength_onset == strength_onset_times[-1]:
                onset_strength_seg = onset_strength[current_strength_onset:]
            else:
                onset_strength_seg = onset_strength[current_strength_onset:strength_onset_times[onset_idx + 1]]

            # keep the onset only if a strong enough strength peak exists in its segment
            if max(onset_strength_seg) < 3:
                strength_onset_times = np.delete(strength_onset_times, onset_idx)
            else:
                thd_corrected_onsets.append(corrected_onsets[onset_idx])

    else:
        return [0]

    thd_corrected_onsets.sort()
    if thd_corrected_onsets:
        return thd_corrected_onsets
    else:
        return [0]
|
| 777 |
+
|
| 778 |
+
|
| 779 |
+
def get_bandwidth_array(audio_samples, fs, nperseg=512, overlap_step=32, rolloff_thresh=0.01,
                        rollon_thresh_percent=0.05, log_bandwidth=False, return_centroid=False,
                        low_bandwidth_method='Percentile', normalisation_method='RMS_Time_Window'):
    """
    Calculate the bandwidth array estimate for an audio signal.

    Required inputs
    :param audio_samples: array of the audio samples
    :param fs: samplerate of the audio samples

    Optional inputs
    :param nperseg: number of samples used for calculating the spectrogram
    :param overlap_step: number of samples overlap for calculating the spectrogram
    :param rolloff_thresh: magnitude threshold for the rolloff (and 'Cutoff' rollon) frequency
    :param rollon_thresh_percent: cumulative-energy fraction for the 'Percentile' rollon frequency
    :param log_bandwidth: return the natural log of the rolloff/rollon ratio instead of the
      linear difference, defaults to False
    :param return_centroid: additionally return the power-weighted mean spectral centroid
    :param low_bandwidth_method: 'Percentile' or 'Cutoff', method for the low frequency limit
      of the bandwidth, defaults to 'Percentile'
    :param normalisation_method: 'Single_TF_Bin', 'RMS_Time_Window', or 'none', method for
      normalising the spectrogram, defaults to 'RMS_Time_Window'

    :return: bandwidth array (one value per time window), time array, and frequency array
      (both from the spectrogram); with return_centroid, also the weighted centroid.

    :raises ValueError: for an unrecognised normalisation_method or low_bandwidth_method.

    Fix note: the 'Cutoff' branch previously never set ``rollon_counter`` although the
    bandwidth computation read it, raising NameError; the low-edge frequency is now
    tracked explicitly per method.
    """
    noverlap = nperseg - overlap_step
    # get spectrogram
    f, t, spec = spectrogram(audio_samples, fs, window='boxcar', nperseg=nperseg, noverlap=noverlap, scaling='density',
                             mode='magnitude')

    # normalise the spectrogram
    if normalisation_method == 'Single_TF_Bin':
        spec /= np.max(spec)
    elif normalisation_method == 'RMS_Time_Window':
        spec /= np.max(np.sqrt(np.sum(spec * spec, axis=0)))
    elif normalisation_method == "none":
        pass
    else:
        raise ValueError('Bandwidth normalisation method must be \'Single_TF_Bin\' or \'RMS_Time_Window\'')

    # get values for thresholding: only windows with at least 10% of the level
    # range above the quietest window are analysed
    level_with_time = np.sum(spec, axis=0)
    max_l = np.max(level_with_time)
    min_l = np.min(level_with_time)
    min_tpower = (0.1 * (max_l - min_l)) + min_l

    # initialise lists for storage
    rollon = []
    rolloff = []
    bandwidth = []
    centroid = []
    centroid_power = []

    # calculate the bandwidth curve
    for time_count in range(len(t)):
        seg = spec[:, time_count]
        tpower = np.sum(seg)
        if tpower > min_tpower:
            # low-frequency edge of the band for this window (None until found)
            rollon_freq = None
            if low_bandwidth_method == 'Percentile':
                # spectral rollon: lowest frequency below which
                # rollon_thresh_percent of the window's energy lies
                rollon_counter = 1
                cumulative_power = np.sum(seg[:rollon_counter])
                rollon_thresh = tpower * rollon_thresh_percent

                while cumulative_power < rollon_thresh:
                    rollon_counter += 1
                    cumulative_power = np.sum(seg[:rollon_counter])
                rollon_freq = f[rollon_counter - 1]
                rollon.append(rollon_freq)
            elif low_bandwidth_method == 'Cutoff':
                # lowest bin whose magnitude exceeds the fixed threshold
                rollon_idx = np.where(seg >= rolloff_thresh)[0]
                if len(rollon_idx):
                    rollon_freq = f[rollon_idx[0]]
                    rollon.append(rollon_freq)
            else:
                raise ValueError('low_bandwidth_method must be \'Percentile\' or \'Cutoff\'')

            # spectral rolloff: highest bin whose magnitude exceeds the threshold.
            # (In 'Cutoff' mode this is empty exactly when the rollon was empty.)
            rolloff_idx = np.where(seg >= rolloff_thresh)[0]
            if len(rolloff_idx) and rollon_freq is not None:
                rolloff_idx = rolloff_idx[-1]
                rolloff.append(f[rolloff_idx])
                if log_bandwidth:
                    # NOTE(review): rollon_freq can be 0 Hz (DC bin), giving
                    # inf/nan from this division — pre-existing behaviour
                    bandwidth.append(np.log(f[rolloff_idx] / float(rollon_freq)))
                else:
                    bandwidth.append(f[rolloff_idx] - rollon_freq)
            else:
                bandwidth.append(0)

            # get centroid values
            centroid.append(np.sum(seg * f) / np.sum(seg))
            centroid_power.append(tpower)
        else:
            # window too quiet to analyse
            bandwidth.append(0)

    if return_centroid:
        return bandwidth, t, f, np.average(centroid, weights=centroid_power)
    else:
        return bandwidth, t, f
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def calculate_bandwidth_gradient(bandwidth_segment, t):
    """
    Calculate the gradient of the rising portion of a bandwidth array.

    The gradient is measured from the last minimum before the peak of the
    segment up to the peak itself, using the spectrogram time step.

    :param bandwidth_segment: segment of the bandwidth array
    :param t: time base (only the step ``t[1] - t[0]`` is used)

    :return: gradient of the bandwidth, or False when no gradient can be
      computed (empty segment, or the peak is the first sample)
    """
    if not bandwidth_segment:
        return False

    peak_idx = np.argmax(bandwidth_segment)
    if peak_idx <= 0:
        # peak at the very start: nothing rises towards it
        return False

    # last occurrence of the minimum value before the peak
    pre_peak = np.array(bandwidth_segment[:peak_idx])
    trough_idx = np.where(pre_peak == min(bandwidth_segment[:peak_idx]))[0][-1]

    rise = bandwidth_segment[peak_idx] - bandwidth_segment[trough_idx]
    duration = (peak_idx - trough_idx) * (t[1] - t[0])

    return rise / duration
|
| 902 |
+
|
| 903 |
+
|
| 904 |
+
def calculate_rms_enveope(audio_samples, step_size=256, overlap_step=256, normalise=True):
    """
    Calculate the (optionally peak-normalised) RMS envelope of an audio signal.

    :param audio_samples: numpy array, the audio samples.
    :param step_size: int, number of samples per RMS window.
    :param overlap_step: int, hop size in samples between windows.
    :param normalise: bool, normalise the envelope to a peak of 1.0.

    :return: RMS envelope as a numpy array
    """
    def _window_rms(window):
        # root of the mean squared sample value for one window
        return np.sqrt(np.mean(window * window))

    envelope = []
    start = 0

    # slide fixed-size windows across the signal
    while start < len(audio_samples) - step_size:
        envelope.append(_window_rms(audio_samples[start:start + step_size]))
        start += overlap_step

    # whatever remains of the signal forms the final window
    envelope.append(_window_rms(audio_samples[start:]))
    envelope = np.array(envelope)

    # scale so the largest value is 1.0
    if normalise:
        envelope = envelope * (1.0 / max(abs(envelope)))

    return envelope
|
| 933 |
+
|
| 934 |
+
|
| 935 |
+
def detect_peaks(array, freq=0, cthr=0.2, unprocessed_array=False, fs=44100):
    """
    Detect the peaks in an array, based on the MIRtoolbox mirpeaks algorithm.

    A candidate peak must be a local maximum at or above cthr, and must be
    separated from its neighbouring peaks by a "contrastive notch" at least
    cthr deep.  Accepted peaks are refined with parabolic interpolation.

    :param array: array in which to detect peaks (typically normalised)
    :param freq: scale representing the x axis, same length as array; if
        scalar, a linear 0..fs/2 scale is generated
    :param cthr: contrast threshold for accepting adjacent peaks
    :param unprocessed_array: unnormalised version of array used to report
        peak values; if False (scalar), defaults to array itself
    :param fs: sample rate used to build the default freq scale

    :return: (peak indices into array, peak values from the unprocessed
        array, peak positions on the freq scale); empty lists when no peak
        passes the contrast test.
    """
    # flatten the array for correct processing
    array = array.flatten()

    if np.isscalar(freq):
        # build a linear frequency scale, assuming a samplerate if none provided
        freq = np.linspace(0, fs/2.0, len(array))

    if np.isscalar(unprocessed_array):
        # no separate unprocessed array supplied: report values from array itself
        unprocessed_array = array

    # pad with -2.0 sentinels so peaks at the first and last samples can be
    # detected (mirpeaks default behaviour)
    array_appended = np.insert(array, [0, len(array)], -2.0)  # to allow peaks at start and end (default of mir)
    # unprocessed array to get peak values
    array_unprocess_appended = np.insert(unprocessed_array, [0, len(unprocessed_array)], -2.0)
    # append the frequency scale for precise freq calculation
    freq_appended = np.insert(freq, [0, len(freq)], -1.0)

    # get the difference values
    diff_array = np.diff(array_appended)

    # find local maxima: value above cthr, rising before, non-rising after
    # (+1 converts indices to the padded-array coordinate system)
    mx = np.array(np.where((array >= cthr) & (diff_array[0:-1] > 0) & (diff_array[1:] <= 0))) + 1

    # initialise arrays for output
    finalmx = []
    peak_value = []
    peak_x = []
    peak_idx = []

    if np.size(mx) > 0:
        # unpack the array if peaks found
        mx = mx[0]

        j = 0  # scans the peaks from beginning to end
        mxj = mx[j]  # the current peak under evaluation
        jj = j + 1
        bufmin = 2.0  # running minimum between the current and next candidate
        bufmax = array_appended[mxj]  # height of the current candidate peak

        # deepest value seen before the current candidate
        if mxj > 1:
            oldbufmin = min(array_appended[:mxj-1])
        else:
            oldbufmin = array_appended[0]

        while jj < len(mx):
            # track the minimum between adjacent candidates; if they are
            # immediately adjacent, use the single sample between them
            if mx[jj-1]+1 == mx[jj]-1:
                bufmin = min([bufmin, array_appended[mx[jj-1]]])
            else:
                bufmin = min([bufmin, min(array_appended[mx[jj-1]:mx[jj]-1])])

            if bufmax - bufmin < cthr:
                # There is no contrastive notch between the two candidates
                if array_appended[mx[jj]] > bufmax:
                    # new peak is significantly higher than the old peak,
                    # the peak is transferred to the new position
                    j = jj
                    mxj = mx[j]  # the current peak
                    bufmax = array_appended[mxj]
                    oldbufmin = min([oldbufmin, bufmin])
                    bufmin = 2.0
                elif array_appended[mx[jj]] - bufmax <= 0:
                    # new peak no higher: keep the old one, absorb the notch
                    bufmax = max([bufmax, array_appended[mx[jj]]])
                    oldbufmin = min([oldbufmin, bufmin])
            else:
                # There is a contrastive notch
                if bufmax - oldbufmin < cthr:
                    # But the previous peak candidate is too weak and therefore discarded
                    oldbufmin = min([oldbufmin, bufmin])
                else:
                    # The previous peak candidate is OK and therefore stored
                    finalmx.append(mxj)
                    oldbufmin = bufmin

                # the new candidate becomes the current peak
                bufmax = array_appended[mx[jj]]
                j = jj
                mxj = mx[j]  # The current peak
                bufmin = 2.0

            jj += 1
        # accept the final candidate if it has enough contrast on both sides
        if bufmax - oldbufmin >= cthr and (bufmax - min(array_appended[mx[j] + 1:]) >= cthr):
            # The last peak candidate is OK and stored
            finalmx.append(mx[j])

        ''' Sort the values according to their level '''
        finalmx = np.array(finalmx)
        sort_idx = np.argsort(array_appended[finalmx])[::-1]  # descending sort
        finalmx = finalmx[sort_idx]

        peak_idx = finalmx - 1  # indexes were for the appended array, -1 to return to original array index
        peak_value = array_unprocess_appended[finalmx]
        peak_x = freq_appended[finalmx]

        ''' Interpolation for more precise peak location '''
        # parabolic fit through each peak and its two neighbours refines the
        # reported position and value (standard quadratic peak interpolation)
        corrected_value = []
        corrected_position = []
        for current_peak_idx in finalmx:
            # if there is enough space to do the fitting
            if 1 < current_peak_idx < (len(array_unprocess_appended) - 2):
                y0 = array_unprocess_appended[current_peak_idx]
                ym = array_unprocess_appended[current_peak_idx-1]
                yp = array_unprocess_appended[current_peak_idx+1]
                # vertex offset of the fitted parabola, in bins (-0.5..0.5)
                p = (yp - ym) / (2 * (2*y0 - yp - ym))
                corrected_value.append(y0 - (0.25*(ym-yp)*p))
                if p >= 0:
                    # vertex lies towards the next bin
                    correct_pos = ((1 - p) * freq_appended[current_peak_idx]) + (p * freq_appended[current_peak_idx+1])
                    corrected_position.append(correct_pos)
                elif p < 0:
                    # vertex lies towards the previous bin
                    correct_pos = ((1 + p) * freq_appended[current_peak_idx]) - (p * freq_appended[current_peak_idx-1])
                    corrected_position.append(correct_pos)
            else:
                # too close to an edge: keep the raw sample position/value
                corrected_value.append(array_unprocess_appended[current_peak_idx])
                corrected_position.append(freq_appended[current_peak_idx])

        if corrected_position:
            peak_x = corrected_position
            peak_value = corrected_value

    return peak_idx, peak_value, peak_x
|
| 1067 |
+
|
| 1068 |
+
|
| 1069 |
+
def sigmoid(x, offset=0.2, n=10):
    """Sigmoidal weighting curve: x**n / (x**n + offset)."""
    powered = x ** n
    return powered / (powered + offset)
|
| 1072 |
+
|
| 1073 |
+
|
| 1074 |
+
def channel_reduction(audio_samples, phase_correction=False):
    """
    Reduce the number of channels in a read-in audio file to mono.

    :param audio_samples: audio samples, 1D (mono) or 2D (samples x channels)
    :param phase_correction: perform a crude phase check on stereo channels
        before the mono sum

    :return: audio samples summed to mono
    """
    shape = np.shape(audio_samples)
    if len(shape) <= 1:
        # already a single channel
        return audio_samples

    if shape[1] == 2:
        # stereo file
        if phase_correction:
            # crude out-of-phase check: strongly anti-correlated channels
            # would cancel in a mono sum, so keep the left channel instead
            r, pval = scipy.stats.pearsonr(audio_samples[:, 0], audio_samples[:, 1])
            if r < -0.5:
                audio_samples = audio_samples[:, 0]  # [:,1] *= -1.0
            else:
                audio_samples = np.sum(audio_samples, axis=1)
        else:
            audio_samples = np.sum(audio_samples, axis=1)
    elif shape[1] > 2:
        # multichannel file: there are multiple layouts for multichannel
        # audio that cannot be decoded here, so sum the first three channels
        audio_samples = np.sum(audio_samples[:, 0:3], axis=1)

        # TODO Update to include multichannel variants and decode according
        # to: http://www.atsc.org/wp-content/uploads/2015/03/A52-201212-17.pdf

    return audio_samples
|
| 1108 |
+
|
| 1109 |
+
|
| 1110 |
+
def spectral_flux(spectrogram, method='sum'):
    """
    Compute the spectral flux: the variation between successive spectrogram frames.

    :param spectrogram: 2D array, frequency bins x time frames
    :param method: 'sum' for the bin-count-normalised Euclidean difference
        between successive frames, or 'multiply' for the correlation-based flux
    :return: 1D flux array with one value per frame transition; implicitly
        None for an unrecognised method (unchanged original behaviour)
    """
    if method == 'sum':
        # Euclidean distance between successive frames, normalised by bin count
        diff_spec = np.diff(spectrogram, axis=1)
        sum_flux = np.sqrt(np.sum(diff_spec ** 2, axis=0)) / float(diff_spec.shape[0])

        return sum_flux

    elif method == 'multiply':
        # multiplication between adjacent frames
        diff_spec = spectrogram[:, :-1] * spectrogram[:, 1:]
        sum_diff_spec = np.sum(diff_spec ** 2.0, axis=0)  # variation across time
        orig_spec_var = np.sum(spectrogram[:, :-1] ** 2.0, axis=0)
        delayed_spec_var = np.sum(spectrogram[:, 1:] ** 2.0, axis=0)
        denom = orig_spec_var * delayed_spec_var

        # nan_to_num guards against silent frames where the denominator is zero
        multiply_flux = np.nan_to_num(1 - sum_diff_spec / denom)

        return multiply_flux
|
| 1135 |
+
|
| 1136 |
+
|
| 1137 |
+
def log_sum(array):
    """
    Calculate the decibel sum of an array of decibel values.

    :param array: array of values in dB
    :return: the combined level in dB
    """
    # convert to linear power, sum, and convert back to dB
    linear_power = 10 ** (array / 10.0)
    return 10 * np.log10(np.sum(linear_power))
|
| 1147 |
+
|
| 1148 |
+
|
| 1149 |
+
def filter_design2(Fc, fs, N):
    """
    Design a Butterworth Nth-order one-third-octave bandpass filter.

    :param Fc: centre frequency of the band [Hz]
    :param fs: sample rate [Hz]
    :param N: filter order
    :return: numerator (b) and denominator (a) filter coefficients
    """
    nyquist = fs / 2.0
    # third-octave band edges, normalised to the Nyquist frequency
    low_edge = ((2.0 ** (-1.0 / 6)) * Fc) / nyquist
    high_edge = ((2.0 ** (1.0 / 6)) * Fc) / nyquist

    # clamp the upper edge just below 1.0 for bands from 3rd_octave_downsample
    # whose upper bandwidth would exceed the Nyquist frequency
    if high_edge >= 1.0:
        high_edge = 0.9999999999

    return scipy.signal.butter(N, [low_edge, high_edge], 'bandpass')
|
| 1164 |
+
|
| 1165 |
+
|
| 1166 |
+
def midbands(Fmin, Fmax, fs):
    """
    Determine the one-third-octave bands between Fmin and Fmax.

    Fmin and Fmax must be nominal third-octave centre frequencies (members of
    the standard 25 Hz .. 20 kHz series); bands above the Nyquist frequency
    are dropped.

    :param Fmin: minimum third-octave band centre frequency [Hz]
    :param Fmax: maximum third-octave band centre frequency [Hz]
    :param fs: sample rate [Hz]

    :return: exact (base-two) midband frequencies, nominal band labels, and
      band indices relative to the 1 kHz reference band

    (Cleanup: dead locals lowest_band, highest_band, and FUpper removed —
    they were computed but never used.)
    """
    Nyquist_frequency = fs / 2.0

    fr = 1000  # reference frequency is 1000Hz
    # band indices relative to the 1 kHz band (index 0 at 1 kHz)
    i = np.arange(-16, 14, 1)
    # nominal (labelled) third-octave centre frequencies
    lab_freq = np.array([25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600,
                         2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500, 16000, 20000])

    A = np.where(lab_freq == Fmin)[0][0]
    B = np.where(lab_freq == Fmax)[0][0]

    # drop bands whose nominal centre lies above the Nyquist frequency
    while lab_freq[B] > Nyquist_frequency:
        B -= 1

    j = i[np.arange(A, B+1, 1)]  # indices to find exact midband frequencies
    ff = (2.0 ** (j / 3.0)) * fr  # exact midband frequencies (calculated as base two exact)
    F = lab_freq[np.arange(A, B+1, 1)]
    return ff, F, j
|
| 1196 |
+
|
| 1197 |
+
|
| 1198 |
+
def filter_third_octaves_downsample(x, Pref, fs, Fmin, Fmax, N):
    """
    Filter an audio signal into third-octave bands and return band levels in dB.

    Bands of 6300 Hz and above are filtered directly; lower bands reuse the
    last octave's three filters on a signal that is repeatedly low-passed and
    downsampled by 2 (multirate implementation).

    :param x: the signal (input length must be a multiple of 2^8)
    :param Pref: reference level for calculating decibels - does not allow for
        negative values
    :param fs: the sampling frequency [Hz]
    :param Fmin: the minimum nominal band frequency
    :param Fmax: the maximum nominal band frequency (must be at least 2500 Hz)
    :param N: the filter order

    :return: (total level in dB, per-band levels in dB, nominal band labels)

    Fix note: the bare ``except:`` around the multirate cascade (which also
    swallowed KeyboardInterrupt/SystemExit) is narrowed to ``except Exception``.
    """
    # identify midband frequencies
    [ff, F, j] = midbands(Fmin, Fmax, fs)

    # apply filters
    P = np.zeros(len(j))
    k = np.where(j == 7)[0][0]  # determines where downsampling will commence (5000 Hz and below)
    m = len(x)

    # For frequencies of 6300 Hz or higher, direct implementation of filters.
    for i in range(len(j)-1, k, -1):
        B, A = filter_design2(ff[i], fs, N)
        # remember the three filters of the last directly-filtered octave;
        # they are reused on each downsampled signal below
        if i == k + 3:  # Upper 1/3-oct. band in last octave.
            Bu = B
            Au = A
        if i == k + 2:  # Center 1/3-oct. band in last octave.
            Bc = B
            Ac = A
        if i == k + 1:  # Lower 1/3-oct. band in last octave.
            Bl = B
            Al = A
        y = scipy.signal.lfilter(B, A, x)
        if np.max(y) > 0:
            P[i] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m))  # Convert to decibels.
        else:
            P[i] = -1.0 * np.inf

    # 5000 Hz or lower, multirate filter implementation.
    try:
        for i in range(k, 1, -3):
            # Design anti-aliasing filter (IIR filter)
            Wn = 0.4
            C, D = scipy.signal.cheby1(2, 0.1, Wn)
            # Filter
            x = scipy.signal.lfilter(C, D, x)
            # Downsample by a factor of 2
            idx = np.arange(1, len(x), 2)
            x = x[idx]
            fs = fs / 2.0
            m = len(x)
            # Perform the filtering with the cached last-octave filters
            y = scipy.signal.lfilter(Bu, Au, x)
            if np.max(y) > 0:
                P[i] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m))
            else:
                P[i] = -1.0 * np.inf
            y = scipy.signal.lfilter(Bc, Ac, x)
            if np.max(y) > 0:
                P[i-1] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m))
            else:
                P[i-1] = -1.0 * np.inf
            y = scipy.signal.lfilter(Bl, Al, x)
            if np.max(y) > 0:
                P[i-2] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m))
            else:
                P[i-2] = -1.0 * np.inf
    except Exception:
        # the cascade ran out of bands/samples (e.g. Bu undefined for a short
        # band range, or too few samples after downsampling): keep what was
        # computed, matching the original best-effort behaviour
        P = P[1:len(j)]

    # "calibrate" the readings based from Pref, chosen as 100 in most uses
    P = P + Pref

    # log transformation: sum the bands in the linear power domain
    Plog = 10 ** (P / 10.0)
    Ptotal = np.sum(Plog)
    if Ptotal > 0:
        Ptotal = 10 * np.log10(Ptotal)
    else:
        Ptotal = -1.0 * np.inf

    return Ptotal, P, F
|
| 1278 |
+
|
| 1279 |
+
|
| 1280 |
+
def specific_loudness(x, Pref, fs, Mod):
    """
    Calculate loudness in third-octave bands based on ISO 532 B / DIN 45631.

    Source: BASIC code in J Acoust Soc Jpn(E) 12, 1 (1991).

    :param x: the signal
    :param Pref: reference value [dB] for calibrating the band levels
    :param fs: sampling frequency [Hz]
    :param Mod: 0 for free field, 1 for diffuse field

    :return: (N_entire, N_single) where
        N_entire = entire loudness [sone] and
        N_single = specific loudness per 0.1-Bark step [sone / Bark],
        240 values covering 0 to 24 Bark.

    Original Matlab code by Claire Churchill Jun. 2004
    Transcoded by Andy Pearce 2018
    """

    # 'Generally used third-octave band filters show a leakage towards neighbouring filters of about -20 dB. This
    # means that a 70dB, 1 - kHz tone produces the following levels at different centre
    # frequencies: 10dB at 500Hz, 30dB at 630Hz, 50dB at 800Hz and 70dB at 1kHz.
    # P211 Psychoacoustics: Facts and Models, E.Zwicker and H.Fastl
    # (A filter order of 4 gives approx this result)

    # set defaults for the third-octave filter bank
    Fmin = 25
    Fmax = 12500
    order = 4
    # filter the audio into third-octave band levels P (dB)
    Ptotal, P, F = filter_third_octaves_downsample(x, Pref, fs, Fmin, Fmax, order);

    # set more defaults for perceptual filters

    # Centre frequencies of 1/3 Oct bands (FR)
    FR = np.array([25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600,
                   2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500])

    # Ranges of 1/3 Oct bands for correction at low frequencies according to equal loudness contours
    RAP = np.array([45, 55, 65, 71, 80, 90, 100, 120])

    # Reduction of 1/3 Oct Band levels at low frequencies according to equal loudness contours
    # within the eight ranges defined by RAP (DLL)
    DLL = np.array([[-32, -24, -16, -10, -5, 0, -7, -3, 0, -2, 0],
                    [-29, -22, -15, -10, -4, 0, -7, -2, 0, -2, 0],
                    [-27, -19, -14, -9, -4, 0, -6, -2, 0, -2, 0],
                    [-25, -17, -12, -9, -3, 0, -5, -2, 0, -2, 0],
                    [-23, -16, -11, -7, -3, 0, -4, -1, 0, -1, 0],
                    [-20, -14, -10, -6, -3, 0, -4, -1, 0, -1, 0],
                    [-18, -12, -9, -6, -2, 0, -3, -1, 0, -1, 0],
                    [-15, -10, -8, -4, -2, 0, -3, -1, 0, -1, 0]])

    # Critical band level at absolute threshold without taking into account the
    # transmission characteristics of the ear
    LTQ = np.array([30, 18, 12, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])  # Threshold due to internal noise
    # Hearing thresholds for the excitation levels (each number corresponds to a critical band 12.5kHz is not included)

    # Attenuation representing transmission between freefield and our hearing system
    A0 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5, -1.6, -3.2, -5.4, -5.6, -4, -1.5, 2, 5, 12])
    # Attenuation due to transmission in the middle ear
    # Moore et al disagrees with this being flat for low frequencies

    # Level correction to convert from a free field to a diffuse field (last critical band 12.5 kHz is not included)
    DDF = np.array([0, 0, 0.5, 0.9, 1.2, 1.6, 2.3, 2.8, 3, 2, 0, -1.4, -2, -1.9, -1, 0.5, 3, 4, 4.3, 4])

    # Correction factor because using third octave band levels (rather than critical bands)
    DCB = np.array([-0.25, -0.6, -0.8, -0.8, -0.5, 0, 0.5, 1.1, 1.5, 1.7, 1.8, 1.8, 1.7, 1.6, 1.4, 1.2, 0.8,
                    0.5, 0, -0.5])

    # Upper limits of the approximated critical bands (Bark)
    ZUP = np.array([0.9, 1.8, 2.8, 3.5, 4.4, 5.4, 6.6, 7.9, 9.2, 10.6, 12.3, 13.8, 15.2, 16.7, 18.1, 19.3, 20.6, 21.8,
                    22.7, 23.6, 24])

    # Range of specific loudness for the determination of the steepness of the upper slopes in the specific loudness
    # - critical band rate pattern (used to plot the correct USL curve)
    RNS = np.array([21.5, 18, 15.1, 11.5, 9, 6.1, 4.4, 3.1, 2.13, 1.36, 0.82, 0.42, 0.30, 0.22, 0.15, 0.10, 0.035, 0])

    # This is used to design the right hand slope of the loudness
    USL = np.array([[13.0, 8.2, 6.3, 5.5, 5.5, 5.5, 5.5, 5.5],
                    [9.0, 7.5, 6.0, 5.1, 4.5, 4.5, 4.5, 4.5],
                    [7.8, 6.7, 5.6, 4.9, 4.4, 3.9, 3.9, 3.9],
                    [6.2, 5.4, 4.6, 4.0, 3.5, 3.2, 3.2, 3.2],
                    [4.5, 3.8, 3.6, 3.2, 2.9, 2.7, 2.7, 2.7],
                    [3.7, 3.0, 2.8, 2.35, 2.2, 2.2, 2.2, 2.2],
                    [2.9, 2.3, 2.1, 1.9, 1.8, 1.7, 1.7, 1.7],
                    [2.4, 1.7, 1.5, 1.35, 1.3, 1.3, 1.3, 1.3],
                    [1.95, 1.45, 1.3, 1.15, 1.1, 1.1, 1.1, 1.1],
                    [1.5, 1.2, 0.94, 0.86, 0.82, 0.82, 0.82, 0.82],
                    [0.72, 0.67, 0.64, 0.63, 0.62, 0.62, 0.62, 0.62],
                    [0.59, 0.53, 0.51, 0.50, 0.42, 0.42, 0.42, 0.42],
                    [0.40, 0.33, 0.26, 0.24, 0.24, 0.22, 0.22, 0.22],
                    [0.27, 0.21, 0.20, 0.18, 0.17, 0.17, 0.17, 0.17],
                    [0.16, 0.15, 0.14, 0.12, 0.11, 0.11, 0.11, 0.11],
                    [0.12, 0.11, 0.10, 0.08, 0.08, 0.08, 0.08, 0.08],
                    [0.09, 0.08, 0.07, 0.06, 0.06, 0.06, 0.06, 0.05],
                    [0.06, 0.05, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02]])

    # apply equal-loudness weighting factors to the lowest 11 third-octave bands
    Xp = np.zeros(11)
    Ti = np.zeros(11)
    for i in range(11):
        j = 0
        # pick the equal-loudness range (row of DLL) this band level falls into
        while (P[i] > (RAP[j] - DLL[j, i])) & (j < 7):
            j += 1
        Xp[i] = P[i] + DLL[j, i]
        Ti[i] = 10.0 ** (Xp[i] / 10.0)  # weighted level as linear intensity

    # Intensity values in first three critical bands calculated
    Gi = np.zeros(3)
    Gi[0] = np.sum(Ti[0:6])  # Gi(1) is the first critical band (sum of two octaves (25Hz to 80Hz))
    Gi[1] = np.sum(Ti[6:9])  # Gi(2) is the second critical band (sum of octave (100Hz to 160Hz))
    Gi[2] = np.sum(Ti[9:11])  # Gi(3) is the third critical band (sum of two third octave bands (200Hz to 250Hz))

    # convert the grouped intensities back to dB where positive
    if np.max(Gi) > 0.0:
        FNGi = 10 * np.log10(Gi)
    else:
        FNGi = -1.0 * np.inf
    LCB = np.zeros_like(Gi)
    for i in range(3):
        if Gi[i] > 0:
            LCB[i] = FNGi[i]
        else:
            LCB[i] = 0

    # Calculate the main loudness in each critical band
    Le = np.ones(20)
    Lk = np.ones_like(Le)
    Nm = np.ones(21)
    for i in range(20):
        Le[i] = P[i+8]
        if i <= 2:
            # first three critical bands use the grouped low-frequency levels
            Le[i] = LCB[i]
        Lk[i] = Le[i] - A0[i]
        Nm[i] = 0
        if Mod == 1:
            # diffuse-field correction
            Le[i] = Le[i] + DDF[i]
        if Le[i] > LTQ[i]:
            # band is above the internal-noise threshold: compute its loudness
            Le[i] = Lk[i] - DCB[i]
            S = 0.25
            MP1 = 0.0635 * 10.0 ** (0.025 * LTQ[i])
            MP2 = (1 - S + S * 10 ** (0.1 * (Le[i] - LTQ[i]))) ** 0.25 - 1
            Nm[i] = MP1 * MP2;
            if Nm[i] <= 0:
                Nm[i] = 0
    Nm[20] = 0

    # correction for the lowest critical band
    KORRY = 0.4 + 0.32 * Nm[0] ** 0.2
    if KORRY > 1:
        KORRY = 1

    Nm[0] = Nm[0] * KORRY

    # Add masking curves to the main loudness in each third octave band
    N = 0
    z1 = 0  # critical band rate starts at 0
    n1 = 0  # loudness level starts at 0
    j = 17
    iz = 0  # write index into the 0.1-Bark specific loudness pattern ns
    z = 0.1
    ns = []

    for i in range(21):
        # Determines where to start on the slope
        ig = i-1
        if ig > 7:
            ig = 7
        control = 1
        while (z1 < ZUP[i]) | (control == 1):  # ZUP is the upper limit of the approximated critical band
            # Determines which of the slopes to use
            if n1 < Nm[i]:  # Nm is the main loudness level
                j = 0
                while RNS[j] > Nm[i]:  # the value of j is used below to build a slope
                    j += 1  # j becomes the index at which Nm(i) is first greater than RNS

            # The flat portions of the loudness graph
            if n1 <= Nm[i]:
                z2 = ZUP[i]  # z2 becomes the upper limit of the critical band
                n2 = Nm[i]
                N = N + n2 * (z2 - z1)  # Sums the output (N_entire)
                # write the flat level into every 0.1-Bark step of this span
                for k in np.arange(z, z2+0.01, 0.1):
                    if not ns:
                        ns.append(n2)
                    else:
                        if iz == len(ns):
                            ns.append(n2)
                        elif iz < len(ns):
                            ns[iz] = n2

                    if k < (z2 - 0.05):
                        iz += 1
                z = k  # z becomes the last value of k
                z = round(z * 10) * 0.1

            # The sloped portions of the loudness graph
            if n1 > Nm[i]:
                n2 = RNS[j]
                if n2 < Nm[i]:
                    n2 = Nm[i]
                dz = (n1 - n2) / USL[j, ig]  # USL = slopes
                dz = round(dz * 10) * 0.1
                if dz == 0:
                    dz = 0.1
                z2 = z1 + dz
                if z2 > ZUP[i]:
                    # slope overshoots the band edge: truncate it at ZUP
                    z2 = ZUP[i]
                    dz = z2 - z1
                    n2 = n1 - dz * USL[j, ig]  # USL = slopes
                N = N + dz * (n1 + n2) / 2.0  # Sums the output (N_entire)
                # write the interpolated slope into the 0.1-Bark pattern
                for k in np.arange(z, z2+0.01, 0.1):
                    if not ns:
                        ns.append(n1 - (k - z1) * USL[j, ig])
                    else:
                        if iz == len(ns):
                            ns.append(n1 - (k - z1) * USL[j, ig])
                        elif iz < len(ns):
                            ns[iz] = n1 - (k - z1) * USL[j, ig]
                    if k < (z2 - 0.05):
                        iz += 1
                z = k
                z = round(z * 10) * 0.1
                if n2 == RNS[j]:
                    j += 1
                if j > 17:
                    j = 17
            n1 = n2
            z1 = z2
            z1 = round(z1 * 10) * 0.1
            control += 1

    if N < 0:
        N = 0

    # quantise the total loudness (finer resolution below 16 sone)
    if N <= 16:
        N = np.floor(N * 1000 + 0.5) / 1000.0
    else:
        N = np.floor(N * 100 + .05) / 100.0

    # loudness level in phon (computed but not returned)
    LN = 40.0 * (N + 0.0005) ** 0.35

    if LN < 3:
        LN = 3

    if N >= 1:
        LN = 10 * np.log10(N) / np.log10(2) + 40;

    # copy the 0.1-Bark pattern into a fixed 240-value array (0..24 Bark)
    N_single = np.zeros(240)
    for i in range(240):
        N_single[i] = ns[i]

    N_entire = N
    return N_entire, N_single
|
| 1532 |
+
|
| 1533 |
+
|
| 1534 |
+
def output_clip(score, min_score=0, max_score=100):
    """
    Clip *score* into the range [min_score, max_score].

    Bug fix: the function previously returned the hard-coded constants
    0.0 / 100.0 regardless of the min_score / max_score arguments, so any
    caller passing non-default bounds got the wrong limits.  The actual
    bounds are now returned.

    :param score:     value to limit.
    :param min_score: lower bound (default 0).
    :param max_score: upper bound (default 100).
    :return: score limited to [min_score, max_score].
    """
    if score < min_score:
        return float(min_score)
    elif score > max_score:
        return float(max_score)
    else:
        return score
|
| 1549 |
+
|
| 1550 |
+
|
| 1551 |
+
def fast_hilbert(array, use_matlab_hilbert=False):
    """
    Hilbert envelope of *array*, computed in overlapping 32768-sample
    segments so each FFT stays small (much faster than one transform over
    the whole signal).  The centre half of each interior segment is kept
    when stitching, since segment edges suffer from transform artefacts.

    Bug fix: the final (shorter) tail segment previously always used
    scipy.signal.hilbert, ignoring the use_matlab_hilbert flag that all
    other segments honour; the flag is now respected throughout.

    :param array: 1D array of audio samples.
    :param use_matlab_hilbert: if True, use the Matlab-style hilbert
        implementation (matlab_hilbert) instead of scipy.signal.hilbert.
    :return: 1D numpy array of np.abs(hilbert(...)) stitched from segments.
    """
    step_size = 32768
    overlap = 2
    overlap_size = int(step_size / (2 * overlap))

    step_start = 0
    hold_hilbert = np.array([])
    while (step_start + step_size) < len(array):
        hold_array = array[step_start:step_start + step_size]
        if use_matlab_hilbert:
            this_hilbert = np.abs(matlab_hilbert(hold_array))
        else:
            this_hilbert = np.abs(scipy.signal.hilbert(hold_array))

        if step_start == 0:
            # first segment: keep everything up to the 3/4 point
            hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[:3 * overlap_size]))
        else:
            # interior segments: keep only the central half
            hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[overlap_size:3 * overlap_size]))

        # advance by half a segment
        step_start += int(step_size / overlap)

    # process the final (shorter) segment, honouring the flag (bug fix)
    hold_array = array[step_start:]
    if use_matlab_hilbert:
        this_hilbert = np.abs(matlab_hilbert(hold_array))
    else:
        this_hilbert = np.abs(scipy.signal.hilbert(hold_array))

    hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[overlap_size:]))
    return hold_hilbert
|
| 1587 |
+
|
| 1588 |
+
|
| 1589 |
+
def fast_hilbert_spectrum(array, use_matlab_hilbert=False):
    """
    Magnitude spectrum of the Hilbert envelope of *array*, computed
    segment-by-segment (32768 samples, half-segment hop) to keep each FFT
    small.  Long signals return the mean of the per-segment half-spectra;
    signals no longer than one segment are zero-padded to 32768 samples
    and transformed once.  Note: for long signals any trailing partial
    segment is discarded.

    :param array: 1D array of audio samples.
    :param use_matlab_hilbert: if True, use the Matlab-style hilbert
        implementation (matlab_hilbert) instead of scipy.signal.hilbert.
    :return: 1D numpy array holding the (half-length) envelope spectrum.
    """
    seg_len = 32768
    hop = int(seg_len / 2)  # segments advance by half their length
    transform = matlab_hilbert if use_matlab_hilbert else scipy.signal.hilbert

    if len(array) > seg_len:
        # long signal: average the half-spectra of all full segments
        spectra = []
        start = 0
        while (start + seg_len) < len(array):
            envelope = np.abs(transform(array[start:start + seg_len]))
            mag = np.abs(np.fft.fft(envelope))
            spectra.append(mag[0:int(len(mag) / 2.0)])  # keep the real half
            start += hop

        hilbert_spectrum = np.mean(spectra, axis=0)
    else:
        # short signal: zero-pad up to one full segment and transform once
        padded = np.pad(array, (0, seg_len - len(array)), 'constant', constant_values=0.0)
        envelope = np.abs(transform(padded))
        mag = np.abs(np.fft.fft(envelope))
        hilbert_spectrum = mag[0:int(len(mag) / 2.0)]  # keep the real half

    return hilbert_spectrum
|
| 1632 |
+
|
| 1633 |
+
|
| 1634 |
+
def matlab_hilbert(signal):
    '''
    Analytic signal of a 1D array via the FFT method used by Matlab's
    hilbert(): weight the spectrum with h (1 at DC and Nyquist, 2 for
    positive frequencies, 0 for negative frequencies) and invert.

    Bug fix: the even/odd branch previously tested `~isodd(n)`, but `~`
    is bitwise NOT, so both ~0 (= -1) and ~1 (= -2) are truthy and
    odd-length input always took the even-length branch, giving one bin
    weight 1 instead of 2.  The parity test is now done directly.

    :param signal: 1D array of real samples.
    :return: complex analytic signal, same length as signal.
    '''
    n = len(signal)
    x = np.fft.fft(signal)
    h = np.zeros(n)

    if n > 0 and n % 2 == 0:
        # even and nonempty
        h[0] = 1
        h[int(n / 2)] = 1
        h[1:int(n / 2)] = 2
    elif n > 0:
        # odd and nonempty
        h[0] = 1
        h[1:int((n + 1) / 2.0)] = 2

    # apply the analytic-signal weighting and return to the time domain
    x = np.fft.ifft(x * h)

    return x
|
| 1660 |
+
|
| 1661 |
+
|
| 1662 |
+
|
| 1663 |
+
def isodd(num):
    """Return 1 when the lowest bit of *num* is set (num is odd), else 0."""
    return 1 if num & 0x1 else 0
|
| 1665 |
+
|
| 1666 |
+
|
| 1667 |
+
def window_audio(audio_samples, window_length=4096):
    """
    Segment the audio samples into non-overlapping windows, one per row.

    The signal is zero-padded up to the next multiple of window_length.
    Bug fix: when the length was already an exact multiple, the old code
    padded by `window_length - 0` and appended a whole extra window of
    zeros; no padding is added in that case now.

    :param audio_samples: 1D array of samples.
    :param window_length: samples per window (default 4096).
    :return: 2D array of shape (n_windows, window_length).
    """
    # how many samples are left after division into whole windows
    remainder = int(np.mod(len(audio_samples), window_length))

    # zero pad only when the last window is incomplete
    if remainder:
        audio_samples = np.pad(audio_samples, (0, int(window_length - remainder)),
                               'constant', constant_values=0.0)
    windowed_samples = np.reshape(audio_samples, (int(len(audio_samples) / window_length), int(window_length)))

    return windowed_samples
|
| 1682 |
+
|
| 1683 |
+
|
| 1684 |
+
def normal_dist(array, theta=1.0, mean=0.0):
    """
    Evaluate a Gaussian-like weighting curve over *array*.

    NOTE(review): the exponent computes -(x - mean)**2 / 2 * theta**2,
    i.e. it multiplies by theta**2 instead of dividing by it (apparently
    missing parentheses relative to the standard normal pdf).  Callers
    normalise the result afterwards and the published timbral models were
    calibrated with this exact form, so it is deliberately preserved —
    confirm against the model calibration before "fixing".
    """
    coeff = 1.0 / (theta * np.sqrt(2.0 * np.pi))
    exponent = (-((array - mean) ** 2.0)) / 2.0 * (theta ** 2.0)
    return coeff * np.exp(exponent)
|
| 1687 |
+
|
| 1688 |
+
|
| 1689 |
+
def weighted_bark_level(audio_samples, fs, low_bark_band=0, upper_bark_band=240):
    """
    Loudness level weighted towards a region of the Bark scale.

    The signal is windowed, the specific loudness of each window is
    computed, and each window's loudness spectrum is weighted by a
    normalised curve peaking at the centre of the requested Bark band.

    :param audio_samples:   1D array of samples.
    :param fs:              sample rate in Hz.
    :param low_bark_band:   lower edge of the emphasised band (0.1-Bark bins).
    :param upper_bark_band: upper edge of the emphasised band (0.1-Bark bins).
    :return: (mean_weight, weighted_weight) - plain mean of the weighted
        levels, and the same values averaged with per-window RMS weights.
    """
    # split the audio into analysis windows
    frames = window_audio(audio_samples)

    # weighting curve emphasising the centre of the requested band
    centre_band = (low_bark_band + upper_bark_band) / 2.0
    band_idx = np.arange(low_bark_band, upper_bark_band)
    curve = normal_dist(band_idx, theta=0.01, mean=centre_band)
    curve -= np.min(curve)
    curve /= np.max(curve)

    weight_array = np.zeros(240)
    weight_array[low_bark_band:upper_bark_band] = curve

    windowed_loud_spec = []
    windowed_rms = []
    weighted_vals = []

    for frame in frames:
        N_entire, N_single = specific_loudness(frame, Pref=100.0, fs=fs, Mod=0)

        # keep the loudness spectrum, the window RMS, and the weighted sum
        windowed_loud_spec.append(N_single)
        windowed_rms.append(np.sqrt(np.mean(frame * frame)))
        weighted_vals.append(np.sum(weight_array * N_single))

    mean_weight = np.mean(weighted_vals)
    weighted_weight = np.average(weighted_vals, weights=windowed_rms)

    return mean_weight, weighted_weight
|
| 1720 |
+
|
| 1721 |
+
|
| 1722 |
+
|
| 1723 |
+
'''
|
| 1724 |
+
Loudnorm function to be included in future update
|
| 1725 |
+
'''
|
| 1726 |
+
def loud_norm(audio, fs=44100, target_loudness=-24.0):
    '''
    Loudness-normalise *audio* to *target_loudness* LUFS with pyloudnorm.

    Signals shorter than 0.4 s are zero-padded for the loudness
    measurement only (the returned audio keeps its original length), and
    the output is peak-limited to +/-1.0 to avoid clipping.

    :param audio:           1D array of samples.
    :param fs:              sample rate in Hz.
    :param target_loudness: target integrated loudness in LUFS.
    :return: loudness-normalised audio.
    '''
    meter = pyln.Meter(fs)

    # the loudness meter needs at least 0.4 seconds of material
    min_len = int(fs * 0.4)
    if len(audio) < min_len:
        # zero pad a measurement copy up to the minimum length
        len_check_audio = np.pad(audio, (0, min_len - len(audio)), 'constant', constant_values=0.0)
    else:
        len_check_audio = audio

    # measure current loudness, then normalise the original-length audio
    current_loudness = meter.integrated_loudness(len_check_audio)
    normalised_audio = pyln.normalize.loudness(audio, current_loudness, target_loudness)

    # reduce level if normalisation pushed the peak past full scale
    peak = np.max(np.abs(normalised_audio))
    if peak > 1.0:
        normalised_audio /= peak

    return normalised_audio
|
| 1755 |
+
|
| 1756 |
+
|
| 1757 |
+
|
| 1758 |
+
|
| 1759 |
+
|
| 1760 |
+
def file_read(fname, fs=0, phase_correction=False, mono_sum=True, loudnorm=True, resample_low_fs=True):
    """
    Load audio for analysis, accepting either a filename or an array.

    A string is decoded with pysoundfile; an array is used directly (the
    caller must then supply fs).  The audio is optionally summed to mono,
    loudness-normalised to -24 LUFS, and upsampled if its rate is too low.

    :param fname:           path to an audio file, or an array of samples.
    :param fs:              sample rate, required when fname is an array.
    :param phase_correction: passed through to channel_reduction.
    :param mono_sum:        sum multichannel audio to mono when True.
    :param loudnorm:        loudness-normalise to -24 LUFS when True.
    :param resample_low_fs: upsample low-rate audio when True.
    :return: (audio_samples, fs)
    :raises ValueError: empty input, silent input, or array without fs.
    :raises TypeError:  fname is neither a string nor array-like.
    """
    if isinstance(fname, six.string_types):
        # a path was given: decode it with pysoundfile
        audio_samples, fs = sf.read(fname, always_2d=False)
    elif hasattr(fname, 'shape'):
        # already an array of samples; the caller must supply the rate
        if fs == 0:
            raise ValueError('If giving function an array, \'fs\' must be specified')
        audio_samples = fname
    else:
        raise TypeError('Input type of \'fname\' must be string, or have a shape attribute (e.g. a numpy array)')

    # reject empty input before any processing
    if audio_samples.size == 0:
        raise ValueError('Input audio file does not contain data')

    if mono_sum:
        audio_samples = channel_reduction(audio_samples, phase_correction)

    # pure silence cannot be analysed
    if np.max(np.abs(audio_samples)) == 0.0:
        raise ValueError('Input file is silence, cannot be analysed.')

    if loudnorm:
        audio_samples = loud_norm(audio_samples, fs, target_loudness=-24.0)

    if resample_low_fs:
        # upsample if required, to avoid errors in the loudness model
        audio_samples, fs = check_upsampling(audio_samples, fs)

    return audio_samples, fs
|
| 1800 |
+
|
| 1801 |
+
|
| 1802 |
+
|
| 1803 |
+
def check_upsampling(audio_samples, fs, lowest_fs=44100):
    """
    Upsample the audio if its sample rate is below *lowest_fs*.

    The specific-loudness calculation assumes at least a 44.1 kHz rate,
    so lower-rate material is resampled up to avoid errors.

    Bug fix: librosa >= 0.10 removed positional orig_sr/target_sr
    arguments from resample(), so the call now uses keyword arguments
    (also accepted by older librosa versions).

    :param audio_samples: 1D array of samples.
    :param fs:            sample rate of audio_samples in Hz.
    :param lowest_fs:     minimum acceptable sample rate (default 44100).
    :return: (audio_samples, fs) after any resampling.
    """
    if fs < lowest_fs:
        # upsample file to avoid errors when calculating specific loudness
        audio_samples = librosa.resample(audio_samples, orig_sr=fs, target_sr=lowest_fs)
        fs = lowest_fs

    return audio_samples, fs
|