avoice-dev committed on
Commit
7926ab4
·
1 Parent(s): 58995dc

fix(common): deploy timbral models analyzer

Browse files
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from timbral_models.Timbral_Extractor import timbral_extractor
3
+
4
def main_timbre_iosr_analysis(in_files):
    """Run the IoSR timbral-model analysis and format a text report.

    :param in_files: a single audio file path (str) or a list of paths; falsy
                     entries (e.g. ``None`` from an empty Gradio input) are
                     skipped.
    :return: str, human-readable report with Russian labels, one section per
             analysed file; empty string when there is nothing to analyse.
    """
    # Gradio hands over a bare filepath for a single-file input.
    if not isinstance(in_files, list):
        in_files = [in_files]

    # Collect report lines and join once at the end (avoids quadratic +=).
    lines = []
    for file_path in in_files:
        if not file_path:
            continue
        timbre = timbral_extractor(file_path)
        # Default missing attributes to 0.0: the previous default of "" made
        # round() raise TypeError whenever the extractor omitted a key.
        lines.append('----- Комплексные характеристики тембра ----- \n')
        lines.append(f'1. Глубина, depth (%): {round(timbre.get("depth", 0.0), 2)} \n')
        lines.append(f'2. Яркость, brightness (%): {round(timbre.get("brightness", 0.0), 2)} \n')
        lines.append(f'3. Теплота, warmth (%): {round(timbre.get("warmth", 0.0), 2)} \n')
        lines.append(f'4. Жесткость, hardness (%): {round(timbre.get("hardness", 0.0), 2)} \n')
        lines.append(f'5. Резкость, sharpness (%): {round(timbre.get("sharpness", 0.0), 2)} \n')
        lines.append(f'6. Шершавость, roughness (%): {round(timbre.get("roughness", 0.0), 2)} \n')
        lines.append(f'7. Гулкость, boominess (%): {round(timbre.get("boominess", 0.0), 2)} \n')
        lines.append(f'8. Реверберация, reverb (0-1): {timbre.get("reverb", "")} \n')

    return ''.join(lines)
23
+
24
def timbre_iosr_analysis(in_files):
    """Gradio callback: delegate straight to the analysis routine."""
    return main_timbre_iosr_analysis(in_files)
27
+
28
# Gradio UI: one audio-file input (left), plain-text results box (right).
# Fixed user-facing typos in the description: "Выбирите" -> "Выберите",
# "появление" -> "появления".
iface = gr.Interface(
    fn=timbre_iosr_analysis,
    inputs=[
        gr.Audio(type="filepath", label="Загрузить аудио файл"),
    ],
    outputs=[
        gr.Textbox(label="Результаты"),
    ],
    title="Анализатор \"Характеристики тембра\"",
    description="Выберите аудиофайл для анализа, дождитесь его загрузки в окне прослушивания (слева). Затем нажмите кнопку \"Запустить\". \n Дождитесь появления результатов в окне вывода (справа).",
    submit_btn="Запустить",
    clear_btn="Очистить",
    allow_flagging="never"
)
42
+
43
if __name__ == "__main__":
    # debug=True keeps the server attached to the console; share=True asks
    # Gradio for a public share link.  NOTE(review): share=True is presumably
    # redundant when deployed on a hosted Space — confirm for the target host.
    iface.launch(debug=True, share=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ numpy
2
+ soundfile
3
+ librosa
4
+ scipy
5
+ scikit-learn
6
+ six
7
+ pyloudnorm
timbral_models/Timbral_Booming.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from . import timbral_util
5
+
6
+
7
def boominess_calculate(loudspec):
    """
    Calculates the Booming Index as described by Hatano, S., and Hashimoto, T. "Booming index as a measure for
    evaluating booming sensation", The 29th International congress and Exhibition on Noise Control Engineering, 2000.

    :param loudspec: specific-loudness spectrum on the 0.1 to 24 Bark grid in
                     0.1 Bark steps (as produced by the loudness model).
    :return: float, booming index for this loudness spectrum.
    """
    # Bark bins of the specific-loudness spectrum and their frequencies in Hz.
    bark_bins = np.arange(0.1, 24.05, 0.1)
    bin_freqs = 600 * np.sinh(bark_bins / 6.0)

    # Centre frequencies of the third-octave bands (25 Hz .. 12.5 kHz).
    third_octave_centres = [25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000,
                            1250, 1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500]

    # Map each Bark-bin frequency onto the log-spaced third-octave scale
    # (fractional band index, 1-based as in the reference implementation).
    log_centres = np.log10(third_octave_centres)
    octave_step = log_centres[1] - log_centres[0]
    fractional_band_index = ((np.log10(bin_freqs) - log_centres[0]) / float(octave_step)) + 1

    # Exponential weighting over band index ...
    weighting = 2.13 * np.exp(-0.151 * fractional_band_index)
    # ... with a hand-tuned low-frequency roll-off for the first four bins
    # (first value estimated).
    weighting[:4] = [0.8, 1.05, 1.10, 1.18]

    # Index of the first bin at or above 280 Hz: everything before it is the
    # "low frequency" region used for the level ratio.
    first_at_or_above_280 = np.where(bin_freqs >= 280)[0][0]

    weighted_loudness = loudspec * weighting
    total_loudness = np.sum(loudspec)
    low_loudness = np.sum(loudspec[:first_at_or_above_280])

    band_sum = timbral_util.log_sum(weighted_loudness)
    return band_sum * (low_loudness / total_loudness)
48
+
49
+
50
def timbral_booming(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False):
    """
    This is an implementation of the Hashimoto booming index feature.
    There are a few fudge factors with the code to convert between the internal representation of the sound using the
    same loudness calculation as the sharpness code.  The equation for calculating the booming index is not
    specifically quoted anywhere so I've done the best I can with the code that was presented.

    Shin, SH, Ih, JG, Hashimoto, T., and Hatano, S.: "Sound quality evaluation of the booming sensation for passenger
    cars", Applied Acoustics, Vol. 70, 2009.

    Hatano, S., and Hashimoto, T. "Booming index as a measure for
    evaluating booming sensation", The 29th International congress and Exhibition on Noise Control Engineering, 2000.

    This function calculates the apparent Boominess of an audio file.

    This version of timbral_booming contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs: int/float, when fname is a numpy array, this is required to be the sample rate.
               Defaults to 0.
    :param dev_output: bool, when False return the boominess, when True return all extracted features.
                       Defaults to False.
    :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum.
                             Defaults to False.
    :param clip_output: bool, force the output to be between 0 and 100.

    :return: float, apparent boominess of the audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)


    # window the audio file into 4096 sample sections
    windowed_audio = timbral_util.window_audio(audio_samples, window_length=4096)

    windowed_booming = []  # per-window booming index
    windowed_rms = []      # per-window RMS level, used as averaging weights below
    for i in range(windowed_audio.shape[0]):
        samples = windowed_audio[i, :]  # the current time window
        # get the rms value and append to list
        windowed_rms.append(np.sqrt(np.mean(samples * samples)))

        # calculate the specific loudness
        N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0)

        # only calculate the booming index when the window contains some level
        if N_entire > 0:
            BoomingIndex = boominess_calculate(N_single)
        else:
            BoomingIndex = 0

        windowed_booming.append(BoomingIndex)

    # get level of low frequencies (weighted bark level over 0-70)
    ll, w_ll = timbral_util.weighted_bark_level(audio_samples, fs, 0, 70)

    ll = np.log10(ll)
    # convert to numpy arrays for fancy indexing
    windowed_booming = np.array(windowed_booming)
    windowed_rms = np.array(windowed_rms)

    # get the weighted average: each window's booming index is weighted by its RMS power
    rms_boom = np.average(windowed_booming, weights=(windowed_rms * windowed_rms))
    rms_boom = np.log10(rms_boom)

    if dev_output:
        return [rms_boom, ll]
    else:

        # perform the linear regression (third element stays 1.0 as the intercept term)
        all_metrics = np.ones(3)
        all_metrics[0] = rms_boom
        all_metrics[1] = ll

        coefficients = np.array([43.67402696195865, -10.90054738389845, 26.836530575185435])

        boominess = np.sum(all_metrics * coefficients)

        if clip_output:
            boominess = timbral_util.output_clip(boominess)

        return boominess
timbral_models/Timbral_Brightness.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from . import timbral_util
5
+ from scipy.signal import spectrogram
6
+
7
+
8
def timbral_brightness(fname, fs=0, dev_output=False, clip_output=False, phase_correction=False, threshold=0,
                       ratio_crossover=2000, centroid_crossover=100, stepSize=1024, blockSize=2048, minFreq=20):
    """
    This function calculates the apparent Brightness of an audio file.
    This version of timbral_brightness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs: int/float, when fname is a numpy array, this is required to be the sample rate.
               Defaults to 0.
    :param dev_output: bool, when False return the brightness, when True return all extracted features.
    :param clip_output: bool, force the output to be between 0 and 100.
    :param phase_correction: bool, perform phase checking before summing to mono.
    :param threshold: threshold below which to ignore the energy in a time window; 0 disables gating,
                      otherwise interpreted in dB relative to the most powerful window.  Default 0.
    :param ratio_crossover: crossover frequency for calculating the HF energy ratio, default 2000 Hz.
    :param centroid_crossover: highpass frequency for calculating the spectral centroid, default 100 Hz.
    :param stepSize: unused; retained for interface compatibility (the hop size is derived from blockSize).
    :param blockSize: block size (fft length) for calculating the spectrogram, default 2048.
    :param minFreq: frequency for high-pass filtering audio prior to all analysis, default 20 Hz.

    :return: Apparent brightness of audio file, float.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Filter audio
    '''
    # highpass audio at minimum frequency; run three passes for a steeper
    # roll-off, mirroring the other timbral models
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs)

    # get highpass audio at ratio crossover
    ratio_highpass_audio = timbral_util.filter_audio_highpass(audio_samples, ratio_crossover, fs)
    ratio_highpass_audio = timbral_util.filter_audio_highpass(ratio_highpass_audio, ratio_crossover, fs)
    ratio_highpass_audio = timbral_util.filter_audio_highpass(ratio_highpass_audio, ratio_crossover, fs)

    # get highpass audio at centroid crossover
    centroid_highpass_audio = timbral_util.filter_audio_highpass(audio_samples, centroid_crossover, fs)
    centroid_highpass_audio = timbral_util.filter_audio_highpass(centroid_highpass_audio, centroid_crossover, fs)
    centroid_highpass_audio = timbral_util.filter_audio_highpass(centroid_highpass_audio, centroid_crossover, fs)

    '''
    Get spectrograms
    '''
    # normalise all three signals to the maximum value of the unfiltered audio
    # (computed once; audio_samples is unchanged until the last line)
    audio_max = max(abs(audio_samples))
    ratio_highpass_audio *= (1.0 / audio_max)
    centroid_highpass_audio *= (1.0 / audio_max)
    audio_samples *= (1.0 / audio_max)

    # set FFT parameters
    nfft = blockSize
    hop_size = int(3 * nfft / 4)

    # check that audio is long enough to generate spectrograms
    if len(audio_samples) >= nfft:
        # get spectrogram
        ratio_all_freq, ratio_all_time, ratio_all_spec = spectrogram(audio_samples, fs, 'hamming', nfft,
                                                                     hop_size, nfft, 'constant', True, 'spectrum')
        ratio_hp_freq, ratio_hp_time, ratio_hp_spec = spectrogram(ratio_highpass_audio, fs, 'hamming', nfft,
                                                                  hop_size, nfft, 'constant', True, 'spectrum')
        centroid_hp_freq, centroid_hp_time, centroid_hp_spec = spectrogram(centroid_highpass_audio, fs, 'hamming', nfft,
                                                                           hop_size, nfft, 'constant', True, 'spectrum')
    else:
        # audio is shorter than one analysis block: use a single full-length window
        ratio_all_freq, ratio_all_time, ratio_all_spec = spectrogram(audio_samples, fs, 'hamming',
                                                                     len(audio_samples),
                                                                     len(audio_samples) - 1,
                                                                     nfft, 'constant', True, 'spectrum')
        ratio_hp_freq, ratio_hp_time, ratio_hp_spec = spectrogram(ratio_highpass_audio, fs, 'hamming',
                                                                  len(ratio_highpass_audio),
                                                                  len(ratio_highpass_audio) - 1,
                                                                  nfft, 'constant', True, 'spectrum')
        centroid_hp_freq, centroid_hp_time, centroid_hp_spec = spectrogram(centroid_highpass_audio, fs, 'hamming',
                                                                           len(centroid_highpass_audio),
                                                                           len(centroid_highpass_audio) - 1,
                                                                           nfft, 'constant', True, 'spectrum')

    # initialise variables for storing data
    all_ratio = []
    all_hp_centroid = []
    all_tpower = []
    all_hp_centroid_tpower = []

    # resolve the energy threshold: 0 means no gating, otherwise it is
    # interpreted as dB relative to the most powerful time window
    threshold_db = threshold
    if threshold_db == 0:
        threshold = 0
    else:
        max_power = max(np.sum(ratio_all_spec, axis=1))
        threshold = max_power * timbral_util.db2mag(threshold_db)

    '''
    Calculate features for each time window
    '''
    for idx in range(len(ratio_hp_time)):
        # get the current spectrum for this time window
        current_ratio_hp_spec = ratio_hp_spec[:, idx]
        current_ratio_all_spec = ratio_all_spec[:, idx]
        current_centroid_hp_spec = centroid_hp_spec[:, idx]

        # get the power within each spectrum
        tpower = np.sum(current_ratio_all_spec)
        hp_tpower = np.sum(current_ratio_hp_spec)
        # check there is energy in the time window before calculating the ratio
        if tpower > threshold:
            # get the HF-to-broadband energy ratio
            all_ratio.append(hp_tpower / tpower)
            # store the power for weighting
            all_tpower.append(tpower)

        # get the tpower to assure it is greater than zero
        hp_centroid_tpower = np.sum(current_centroid_hp_spec)
        if hp_centroid_tpower > 0.0:
            # get the spectral centroid of the highpassed spectrum
            all_hp_centroid.append(np.sum(current_centroid_hp_spec * centroid_hp_freq[:len(current_centroid_hp_spec)]) /
                                   np.sum(current_centroid_hp_spec))
            # store the tpower for weighting
            all_hp_centroid_tpower.append(hp_centroid_tpower)

    '''
    Get weighted average values
    '''
    # power-weighted averages across time windows; the previously-computed
    # unweighted means were never used and have been removed
    weighted_mean_ratio = np.average(all_ratio, weights=all_tpower)
    weighted_mean_hp_centroid = np.average(all_hp_centroid, weights=all_hp_centroid_tpower)

    if dev_output:
        # return the log ratio and log centroid
        return np.log10(weighted_mean_ratio), np.log10(weighted_mean_hp_centroid)
    else:
        # perform the linear regression; the third element of all_metrics
        # stays 1.0 and acts as the intercept term
        all_metrics = np.ones(3)
        all_metrics[0] = np.log10(weighted_mean_ratio)
        all_metrics[1] = np.log10(weighted_mean_hp_centroid)

        coefficients = np.array([4.613128018020465, 17.378889309312974, 17.434733750553022])

        bright = np.sum(all_metrics * coefficients)

        if clip_output:
            bright = timbral_util.output_clip(bright)

        return bright
timbral_models/Timbral_Depth.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from scipy.signal import spectrogram
5
+ import scipy.stats
6
+ from . import timbral_util
7
+
8
+
9
def timbral_depth(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, threshold_db=-60,
                  low_frequency_limit=20, centroid_crossover_frequency=2000, ratio_crossover_frequency=500,
                  db_decay_threshold=-40):
    """
    This function calculates the apparent Depth of an audio file.
    This version of timbral_depth contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname: string or numpy array
                  string, audio filename to be analysed, including full file path and extension.
                  numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
    :param fs: int/float, when fname is a numpy array, this is required to be the sample rate.
               Defaults to 0.
    :param phase_correction: bool, perform phase checking before summing to mono.  Defaults to False.
    :param dev_output: bool, when False return the depth, when True return all extracted
                       features.  Default to False.
    :param threshold_db: float/int (negative), threshold, in dB, for calculating centroids.
                         Should be negative.  Defaults to -60.
    :param low_frequency_limit: float/int, low frequency limit at which to highpass filter the audio, in Hz.
                                Defaults to 20.
    :param centroid_crossover_frequency: float/int, crossover frequency for calculating the spectral centroid, in Hz.
                                         Defaults to 2000.
    :param ratio_crossover_frequency: float/int, crossover frequency for calculating the ratio, in Hz.
                                      Defaults to 500.
    :param db_decay_threshold: float/int (negative), threshold, in dB, for estimating duration.  Should be
                               negative.  Defaults to -40.

    :return: float, apparent depth of audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Filter audio
    '''
    # highpass audio - run 3 times to get -18dB per octave - unstable filters produced when using a 6th order
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs)
    audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs)

    # running 3 times to get -18dB per octave rolloff, greater than second order filters are unstable in python
    lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(audio_samples,crossover=centroid_crossover_frequency,fs=fs)
    lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(lowpass_centroid_audio_samples,crossover=centroid_crossover_frequency,fs=fs)
    lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(lowpass_centroid_audio_samples,crossover=centroid_crossover_frequency,fs=fs)

    lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(audio_samples,crossover=ratio_crossover_frequency,fs=fs)
    lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(lowpass_ratio_audio_samples,crossover=ratio_crossover_frequency,fs=fs)
    lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(lowpass_ratio_audio_samples,crossover=ratio_crossover_frequency,fs=fs)

    '''
    Get spectrograms and normalise
    '''
    # normalise audio (all three signals scaled by the broadband maximum)
    lowpass_ratio_audio_samples *= (1.0 / max(abs(audio_samples)))
    lowpass_centroid_audio_samples *= (1.0 / max(abs(audio_samples)))
    audio_samples *= (1.0 / max(abs(audio_samples)))

    # set FFT parameters
    nfft = 4096
    hop_size = int(3 * nfft / 4)
    # get spectrogram
    if len(audio_samples) > nfft:
        freq, time, spec = spectrogram(audio_samples, fs, 'hamming', nfft, hop_size,
                                       nfft, 'constant', True, 'spectrum')
        lp_centroid_freq, lp_centroid_time, lp_centroid_spec = spectrogram(lowpass_centroid_audio_samples, fs,
                                                                           'hamming', nfft, hop_size, nfft,
                                                                           'constant', True, 'spectrum')
        lp_ratio_freq, lp_ratio_time, lp_ratio_spec = spectrogram(lowpass_ratio_audio_samples, fs, 'hamming', nfft,
                                                                  hop_size, nfft, 'constant', True, 'spectrum')

    else:
        # file is shorter than 4096, just take the fft
        freq, time, spec = spectrogram(audio_samples, fs, 'hamming', len(audio_samples), len(audio_samples)-1,
                                       nfft, 'constant', True, 'spectrum')
        lp_centroid_freq, lp_centroid_time, lp_centroid_spec = spectrogram(lowpass_centroid_audio_samples, fs,
                                                                           'hamming',
                                                                           len(lowpass_centroid_audio_samples),
                                                                           len(lowpass_centroid_audio_samples)-1,
                                                                           nfft, 'constant', True, 'spectrum')
        lp_ratio_freq, lp_ratio_time, lp_ratio_spec = spectrogram(lowpass_ratio_audio_samples, fs, 'hamming',
                                                                  len(lowpass_ratio_audio_samples),
                                                                  len(lowpass_ratio_audio_samples)-1,
                                                                  nfft, 'constant', True, 'spectrum')

    # magnitude threshold below which a time window is treated as silence
    threshold = timbral_util.db2mag(threshold_db)

    '''
    METRIC 1 - limited weighted mean normalised lower centroid
    '''
    # define arrays for storing metrics
    all_normalised_lower_centroid = []
    all_normalised_centroid_tpower = []

    # get metrics for each time segment of the spectrogram
    for idx in range(len(time)):
        # get overall spectrum of time frame
        current_spectrum = spec[:, idx]
        # calculate time window power
        tpower = np.sum(current_spectrum)
        all_normalised_centroid_tpower.append(tpower)

        # estimate if time segment contains audio energy or just noise
        if tpower > threshold:
            # get the lowpassed spectrum
            lower_spectrum = lp_centroid_spec[:, idx]
            lower_power = np.sum(lower_spectrum)

            # get lower centroid
            lower_centroid = np.sum(lower_spectrum * lp_centroid_freq) / float(lower_power)

            # append to list
            all_normalised_lower_centroid.append(lower_centroid)
        else:
            all_normalised_lower_centroid.append(0)

    # calculate the weighted mean of lower centroids
    weighted_mean_normalised_lower_centroid = np.average(all_normalised_lower_centroid,
                                                         weights=all_normalised_centroid_tpower)
    # limit to the centroid crossover frequency
    if weighted_mean_normalised_lower_centroid > centroid_crossover_frequency:
        limited_weighted_mean_normalised_lower_centroid = np.float64(centroid_crossover_frequency)
    else:
        limited_weighted_mean_normalised_lower_centroid = weighted_mean_normalised_lower_centroid

    '''
    METRIC 2 - weighted mean normalised lower ratio
    '''
    # define arrays for storing metrics
    all_normalised_lower_ratio = []
    all_normalised_ratio_tpower = []

    # get metrics for each time segment of the spectrogram
    for idx in range(len(time)):
        # get time frame of broadband spectrum
        current_spectrum = spec[:, idx]
        tpower = np.sum(current_spectrum)
        all_normalised_ratio_tpower.append(tpower)

        # estimate if time segment contains audio energy or just noise
        if tpower > threshold:
            # get the lowpass spectrum
            lower_spectrum = lp_ratio_spec[:, idx]
            # get the power of this
            lower_power = np.sum(lower_spectrum)
            # get the ratio of LF to all energy
            lower_ratio = lower_power / float(tpower)
            # append to array
            all_normalised_lower_ratio.append(lower_ratio)
        else:
            all_normalised_lower_ratio.append(0)

    # calculate the power-weighted mean of the LF energy ratios
    weighted_mean_normalised_lower_ratio = np.average(all_normalised_lower_ratio, weights=all_normalised_ratio_tpower)

    '''
    METRIC 3 - Approximate duration/decay-time of sample
    '''
    all_my_duration = []

    # get envelope of signal
    envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs)
    # estimate onsets
    onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs)

    # get RMS envelope - better follows decays than the sample-and-hold
    rms_step_size = 256
    rms_envelope = timbral_util.calculate_rms_enveope(audio_samples, step_size=rms_step_size)

    # convert decay threshold to magnitude
    decay_threshold = timbral_util.db2mag(db_decay_threshold)
    # rescale onsets to rms stepsize - casting to int
    time_convert = fs / float(rms_step_size)
    onsets = (np.array(onsets) / float(rms_step_size)).astype('int')

    for idx, onset in enumerate(onsets):
        # segment runs from this onset to the next one (or to the end for the last onset)
        if onset == onsets[-1]:
            segment = rms_envelope[onset:]
        else:
            segment = rms_envelope[onset:onsets[idx + 1]]

        # get location of max RMS frame
        max_idx = np.argmax(segment)
        # get the segment from this max until the next onset
        post_max_segment = segment[max_idx:]

        # estimate duration based on decay or until next onset
        if min(post_max_segment) >= decay_threshold:
            # never decays below threshold: duration is the whole post-max segment
            my_duration = len(post_max_segment) / time_convert
        else:
            # duration is the time until the envelope first drops below threshold
            my_duration = np.where(post_max_segment < decay_threshold)[0][0] / time_convert

        # append to array
        all_my_duration.append(my_duration)

    # calculate the log of mean duration
    mean_my_duration = np.log10(np.mean(all_my_duration))

    '''
    METRIC 4 - f0 estimation with peak picking
    '''
    # get the overall spectrum
    all_spectrum = np.sum(spec, axis=1)
    # normalise this to the 0..1 range
    norm_spec = (all_spectrum - np.min(all_spectrum)) / (np.max(all_spectrum) - np.min(all_spectrum))
    # set limit for peak picking
    cthr = 0.01
    # detect peaks
    peak_idx, peak_value, peak_freq = timbral_util.detect_peaks(norm_spec, cthr=cthr, unprocessed_array=norm_spec,
                                                                freq=freq)
    # estimate pitch as the log of the lowest peak frequency
    # NOTE(review): the guard checks peak_freq[0] but the value used is
    # min(peak_freq) — presumably detect_peaks returns ascending frequencies
    # so these coincide; confirm against timbral_util.detect_peaks.
    pitch_estimate = np.log10(min(peak_freq)) if peak_freq[0] > 0 else 0

    # get outputs
    if dev_output:
        return limited_weighted_mean_normalised_lower_centroid, weighted_mean_normalised_lower_ratio, mean_my_duration, \
               pitch_estimate, weighted_mean_normalised_lower_ratio * mean_my_duration, \
               timbral_util.sigmoid(weighted_mean_normalised_lower_ratio) * mean_my_duration
    else:
        '''
        Perform linear regression to obtain depth
        '''
        # coefficients from linear regression
        coefficients = np.array([-0.0043703565847874465, 32.83743202462131, 4.750862716905235, -14.217438690256062,
                                 3.8782339862813924, -0.8544826091735516, 66.69534393444391])

        # the metrics feeding the regression model
        metric1 = limited_weighted_mean_normalised_lower_centroid
        metric2 = weighted_mean_normalised_lower_ratio
        metric3 = mean_my_duration
        metric4 = pitch_estimate
        metric5 = metric2 * metric3
        metric6 = timbral_util.sigmoid(metric2) * metric3

        # pack metrics into a matrix (last element is the intercept term)
        all_metrics = np.zeros(7)

        all_metrics[0] = metric1
        all_metrics[1] = metric2
        all_metrics[2] = metric3
        all_metrics[3] = metric4
        all_metrics[4] = metric5
        all_metrics[5] = metric6
        all_metrics[6] = 1.0

        # perform linear regression
        depth = np.sum(all_metrics * coefficients)

        if clip_output:
            depth = timbral_util.output_clip(depth)

        return depth
289
+
timbral_models/Timbral_Extractor.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import soundfile as sf
3
+ import numpy as np
4
+ import six
5
+ from . import timbral_util, timbral_hardness, timbral_depth, timbral_brightness, timbral_roughness, timbral_warmth, \
6
+ timbral_sharpness, timbral_booming, timbral_reverb
7
+
8
def timbral_extractor(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False,
                      output_type='dictionary', verbose=True):
    """
    Extract all timbral attributes in one function call, returning the results as either
    a list or dictionary, depending on input definitions.

    Version 0.4

    Required parameter
      :param fname:            string or numpy array
                               string, audio filename to be analysed, including full file path and extension.
                               numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
      :param fs:               int/float, when fname is a numpy array, this is required to be the sample rate.
                               Defaults to 0.
      :param phase_correction: bool, perform phase checking before summing to mono.  Defaults to False.
      :param dev_output:       bool, when False return the attribute scores, when True return all extracted
                               features.  Defaults to False.
      :param clip_output:      bool, force the output to be between 0 and 100.
      :param output_type:      string, defines the type the output should be formatted in.  Accepts either
                               'dictionary' or 'list' as parameters.  Defaults to 'dictionary'.
      :param verbose:          bool, print a progress message as each attribute is calculated.

    :return: timbre            the results from all timbral attributes as either a dictionary or list,
                               depending on output_type.

    :raises ValueError:        if output_type is invalid, fs is missing for array input, or fname is
                               neither a string nor a numpy array.
    :raises TypeError:         if the audio file cannot be read.

    Copyright 2019 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.
    """
    '''
      Check output_type before calculating anything
    '''
    if output_type not in ('dictionary', 'list'):
        raise ValueError('output_type must be \'dictionary\' or \'list\'.')

    '''
      Basic audio reading
    '''
    if isinstance(fname, six.string_types):
        # read audio file only once and pass arrays to algorithms
        try:
            audio_samples, fs = sf.read(fname)
        except Exception:
            # narrow from a bare except: still best-effort, but no longer hides
            # KeyboardInterrupt/SystemExit, and the cause is printed for debugging
            print('Soundfile failed to load: ' + str(fname))
            raise TypeError('Unable to read audio file.')
        # making an array again for copying purposes (reverb needs all channels)
        multi_channel_audio = np.array(audio_samples)
    elif hasattr(fname, 'shape'):
        if fs == 0:
            raise ValueError('If giving function an array, \'fs\' must be specified')
        audio_samples = fname
        multi_channel_audio = np.array(fname)
    else:
        raise ValueError('Input must be either a string or a numpy array.')

    # channel reduction (sum to mono for the per-attribute models)
    audio_samples = timbral_util.channel_reduction(audio_samples)

    # resample audio file if sample rate is less than 44100
    audio_samples, fs = timbral_util.check_upsampling(audio_samples, fs)

    # every attribute model (except reverb) shares the same call signature,
    # so run them from a single data-driven loop instead of eight copies
    attribute_functions = [
        ('hardness', timbral_hardness),
        ('depth', timbral_depth),
        ('brightness', timbral_brightness),
        ('roughness', timbral_roughness),
        ('warmth', timbral_warmth),
        ('sharpness', timbral_sharpness),
        ('boominess', timbral_booming),
    ]

    results = {}
    for attribute_name, attribute_function in attribute_functions:
        if verbose:
            print('Calculating %s...' % attribute_name)
        results[attribute_name] = attribute_function(audio_samples, fs=fs,
                                                     dev_output=dev_output,
                                                     phase_correction=phase_correction,
                                                     clip_output=clip_output)

    if verbose:
        print('Calculating reverb...')
    # reverb calculated on all channels
    results['reverb'] = timbral_reverb(multi_channel_audio, fs=fs)

    '''
      Format output
    '''
    if output_type == 'dictionary':
        return results
    # output_type == 'list' (already validated above): preserve the documented order
    return [results[key] for key, _ in attribute_functions] + [results['reverb']]
138
+
139
+
140
+
timbral_models/Timbral_Hardness.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import librosa
4
+ import soundfile as sf
5
+ import six
6
+ from scipy.signal import spectrogram
7
+ from . import timbral_util
8
+
9
def timbral_hardness(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, max_attack_time=0.1,
                     bandwidth_thresh_db=-75):
    """
    This function calculates the apparent hardness of an audio file.
    This version of timbral_hardness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
      :param fname:                 string or numpy array
                                    string, audio filename to be analysed, including full file path and extension.
                                    numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
      :param fs:                    int/float, when fname is a numpy array, this is required to be the sample rate.
                                    Defaults to 0.
      :param phase_correction:      bool, perform phase checking before summing to mono.  Defaults to False.
      :param dev_output:            bool, when False return the hardness, when True return all extracted
                                    features.  Defaults to False.
      :param clip_output:           bool, force the output to be between 0 and 100.
      :param max_attack_time:       float, set the maximum attack time, in seconds.  Defaults to 0.1.
      :param bandwidth_thresh_db:   float, set the threshold for calculating the bandwidth.  Defaults to -75dB.


    :return:                        float, apparent hardness of audio file (dev_output = False/default).
                                    With dev_output set to True returns the weighted mean bandwidth,
                                    mean attack time, harmonic-percussive ratio, and unitless attack centroid.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """

    '''
      Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
      Calculate the midband level
    '''
    # get the level in the midband (Bark bands 70-140 per timbral_util's banding)
    midband_level, weighed_midband_level = timbral_util.weighted_bark_level(audio_samples, fs, low_bark_band=70,
                                                                            upper_bark_band=140)
    log_weighted_midband_level = np.log10(weighed_midband_level)

    '''
      Calculate the harmonic-percussive ratio pre zero-padding the signal
    '''
    HP_ratio = timbral_util.get_percussive_audio(audio_samples, return_ratio=True)
    log_HP_ratio = np.log10(HP_ratio)

    '''
      Zeropad the signal
    '''
    # zero pad the signal so the first onset is not lost to spectrogram windowing
    nperseg = 4096  # default value for spectrogram analysis
    audio_samples = np.lib.pad(audio_samples, (nperseg+1, 0), 'constant', constant_values=(0.0, 0.0))

    '''
      Calculate the envelope and onsets
    '''
    # calculate the envelope of the signal
    envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.1)
    # NOTE(review): envelope_time is computed but never used below
    envelope_time = np.arange(len(envelope)) / fs

    # calculate the onsets
    original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg)
    onset_strength = librosa.onset.onset_strength(y=audio_samples, sr=fs)
    # If onsets don't exist, set it to time zero
    if not original_onsets:
        original_onsets = [0]
    # set to start of file in the case where there is only one onset
    if len(original_onsets) == 1:
        original_onsets = [0]

    # undo the zero-padding offset; clamp negative positions to the file start
    onsets = np.array(original_onsets) - nperseg
    onsets[onsets < 0] = 0

    '''
      Calculate the spectrogram so that the bandwidth can be created
    '''
    bandwidth_step_size = 128
    mag = timbral_util.db2mag(bandwidth_thresh_db)  # calculate threshold in linear from dB
    bandwidth, t, f = timbral_util.get_bandwidth_array(audio_samples, fs, nperseg=nperseg,
                                                       overlap_step=bandwidth_step_size, rolloff_thresh=mag,
                                                       normalisation_method='none')
    # bandwidth sample rate
    bandwidth_fs = fs / float(bandwidth_step_size)  # fs due to spectrogram step size

    '''
      Set all parameters for holding data per onset
    '''
    all_bandwidth_max = []
    all_attack_time = []
    all_max_strength = []
    all_max_strength_bandwidth = []
    all_attack_centroid = []

    '''
      Get bandwidth onset times and max bandwidth
    '''
    # map sample-domain onset positions onto the bandwidth array's frame grid
    bandwidth_onset = np.array(onsets / float(bandwidth_step_size)).astype('int')  # overlap_step=128

    '''
      Iterate through onsets and calculate metrics for each
    '''
    for onset_count in range(len(bandwidth_onset)):
        '''
          Calculate the bandwidth max for the attack portion of the onset
        '''
        # get the section of the bandwidth array between onsets
        onset = bandwidth_onset[onset_count]
        if onset == bandwidth_onset[-1]:
            bandwidth_seg = np.array(bandwidth[onset:])
        else:
            next_onset = bandwidth_onset[onset_count + 1]
            bandwidth_seg = np.array(bandwidth[onset:next_onset])

        if max(bandwidth_seg) > 0:
            # making a copy of the bandwidth segment to avoid array changes
            hold_bandwidth_seg = list(bandwidth_seg)

            # calculate onset of the attack in the bandwidth array
            if max(bandwidth_seg) > 0:
                bandwidth_attack = timbral_util.calculate_attack_time(bandwidth_seg, bandwidth_fs,
                                                                      calculation_type='fixed_threshold',
                                                                      max_attack_time=max_attack_time)
            else:
                bandwidth_attack = []

            # calculate the bandwidth max for the attack portion
            if bandwidth_attack:
                start_idx = bandwidth_attack[2]
                if max_attack_time > 0:
                    max_attack_time_samples = int(max_attack_time * bandwidth_fs)
                    # NOTE(review): comparing the remaining-segment length against
                    # start_idx + max_attack_time_samples looks off — the natural
                    # condition would be `> max_attack_time_samples`; confirm intent
                    # before changing, as it alters which window feeds the max().
                    if len(hold_bandwidth_seg[start_idx:]) > start_idx+max_attack_time_samples:
                        all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:start_idx+max_attack_time_samples]))
                    else:
                        all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:]))
                else:
                    all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:]))
        else:
            # set as blank so the bandwidth check below skips this onset
            bandwidth_attack = []

        '''
          Calculate the attack time
        '''
        onset = original_onsets[onset_count]
        if onset == original_onsets[-1]:
            attack_seg = np.array(envelope[onset:])
            strength_seg = np.array(onset_strength[int(onset/512):])  # 512 is librosa default window size
            audio_seg = np.array(audio_samples[onset:])
        else:
            attack_seg = np.array(envelope[onset:original_onsets[onset_count + 1]])
            strength_seg = np.array(onset_strength[int(onset/512):int(original_onsets[onset_count+1]/512)])
            audio_seg = np.array(audio_samples[onset:original_onsets[onset_count + 1]])

        attack_time = timbral_util.calculate_attack_time(attack_seg, fs, max_attack_time=max_attack_time)
        all_attack_time.append(attack_time[0])

        '''
          Get the attack strength for weighting the bandwidth max
        '''
        all_max_strength.append(max(strength_seg))
        # only onsets with a valid bandwidth attack contribute bandwidth weights
        if bandwidth_attack:
            all_max_strength_bandwidth.append(max(strength_seg))

        '''
          Get the spectral centroid of the attack (125ms after attack start)
        '''
        # identify the start of the attack
        th_start_idx = attack_time[2]
        # define how long the attack time can be
        centroid_int_samples = int(0.125 * fs)  # number of samples for attack time integration

        # start of attack section from attack time calculation
        if th_start_idx + centroid_int_samples >= len(audio_seg):
            audio_seg = audio_seg[th_start_idx:]
        else:
            audio_seg = audio_seg[th_start_idx:th_start_idx + centroid_int_samples]

        # check that there's a suitable length of samples to get attack centroid
        # minimum length arbitrarily set to 512 samples
        if len(audio_seg) > 512:
            # get all spectral features for this attack section
            spectral_features_hold = timbral_util.get_spectral_features(audio_seg, fs)

            # store unitless attack centroid if exists
            if spectral_features_hold:
                all_attack_centroid.append(spectral_features_hold[0])

    '''
      Calculate mean and weighted average values for features
    '''
    # attack time
    mean_attack_time = np.mean(all_attack_time)

    # get the weighted mean of bandwidth max and limit lower value
    if len(all_bandwidth_max):
        mean_weighted_bandwidth_max = np.average(all_bandwidth_max, weights=all_max_strength_bandwidth)
        # check for zero values so the log bandwidth max can be taken
        if mean_weighted_bandwidth_max <= 512.0:
            mean_weighted_bandwidth_max = fs / 512.0  # minimum value
    else:
        mean_weighted_bandwidth_max = fs / 512.0  # minimum value

    # take the logarithm
    log_weighted_bandwidth_max = np.log10(mean_weighted_bandwidth_max)

    # get the mean of the onset strengths
    mean_max_strength = np.mean(all_max_strength)
    log_mean_max_strength = np.log10(mean_max_strength)

    if all_attack_centroid:
        mean_attack_centroid = np.mean(all_attack_centroid)
    else:
        mean_attack_centroid = 200.0

    # limit the lower limit of the attack centroid to allow for log to be taken
    if mean_attack_centroid <= 200:
        mean_attack_centroid = 200.0
    log_attack_centroid = np.log10(mean_attack_centroid)

    '''
      Either return the raw features, or calculate the linear regression.
    '''
    if dev_output:
        return log_weighted_bandwidth_max, log_attack_centroid, log_weighted_midband_level, log_HP_ratio, log_mean_max_strength, mean_attack_time
    else:
        '''
          Apply regression model
        '''
        all_metrics = np.ones(7)
        all_metrics[0] = log_weighted_bandwidth_max
        all_metrics[1] = log_attack_centroid
        all_metrics[2] = log_weighted_midband_level
        all_metrics[3] = log_HP_ratio
        all_metrics[4] = log_mean_max_strength
        all_metrics[5] = mean_attack_time

        # coefficients = np.array([13.5330599736, 18.1519030059, 13.1679266873, 5.03134507433, 5.22582123237, -3.71046018962, -89.8935449357])

        # recalculated values when using loudnorm
        coefficients = np.array([12.079781720638145, 18.52100377170042, 14.139883645260355, 5.567690321917516,
                                 3.9346817690405635, -4.326890461087848, -85.60352209068202])

        hardness = np.sum(all_metrics * coefficients)

        # clip output between 0 and 100
        if clip_output:
            hardness = timbral_util.output_clip(hardness)

        return hardness
timbral_models/Timbral_Reverb.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ import six
5
+ from scipy.signal import spectrogram
6
+ from . import timbral_util
7
+
8
def timbral_reverb(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False):
    """
    This function classifies whether or not the audio file sounds reverberant.

    This is based on the RT60 estimation algorithm documented in:
    Jan, T., and Wang, W., 2012: "Blind reverberation time estimation based on Laplace distribution",
    EUSIPCO. pp. 2050-2054, Bucharest, Romania.

    Version 0.4

    Required parameter
      :param fname:             string or numpy array
                                string, audio filename to be analysed, including full file path and extension.
                                numpy array, array of audio samples, requires fs to be set to the sample rate.

    Optional parameters
      :param fs:                int/float, when fname is a numpy array, this is required to be the sample rate.
                                Defaults to 0.
      :param phase_correction:  has no effect on the code.  Implemented for consistency with other timbral
                                functions.
      :param dev_output:        bool, when True return the (mean_RT60, probability) pair instead of the
                                binary classification.  Defaults to False.
      :param clip_output:       has no effect on the code.  Implemented for consistency with other timbral
                                functions.

    :return:                    predicted reverb of audio file.  1 represents that the file sounds
                                reverberant, 0 represents that the file does not sound reverberant.

    Copyright 2019 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.
    """
    # read without summing to mono so each channel can be analysed separately
    raw_audio_samples, fs = timbral_util.file_read(fname, fs=fs, phase_correction=False, mono_sum=False,
                                                   loudnorm=False)

    # check for mono file
    if len(raw_audio_samples.shape) < 2:
        # it's a mono file
        mean_RT60 = estimate_RT60(raw_audio_samples, fs)
    else:
        # the file has channels, estimate RT for the first two and take the mean
        l_RT60 = estimate_RT60(raw_audio_samples[:, 0], fs)
        r_RT60 = estimate_RT60(raw_audio_samples[:, 1], fs)

        mean_RT60 = np.mean([l_RT60, r_RT60])

    # logistic regression model mapping the RT60 estimate to a probability
    probability = reverb_logistic_regression(mean_RT60)

    if dev_output:
        return mean_RT60, probability

    # threshold the probability at 0.5 for the binary classification
    return 1 if probability >= 0.5 else 0
64
+
65
+
66
def estimate_RT60(audio_samples, fs):
    """
    Blindly estimate the reverberation time (RT60) of a single audio channel.

    Frame-wise blind RT estimation adapted from the original MATLAB code by
    Heinrich Loellmann (IND, RWTH Aachen), as used in:
    Jan, T., and Wang, W., 2012: "Blind reverberation time estimation based on
    Laplace distribution", EUSIPCO.

    :param audio_samples: np.ndarray, single channel of (reverberant) audio.
    :param fs:            int/float, sampling frequency of audio_samples.

    :return: float, mean of the per-frame RT estimates over frames where a sound
             decay was detected, or the smallest candidate RT value when the
             audio is too short or no decay was found.
    """
    # struct with all parameters and buffers for frame-wise processing
    par = init_rt_estimate_e(fs)
    BL = par['N'] * par['down']  # full analysis block length before downsampling

    Laudio = len(audio_samples)

    # audio too short for analysis: return the smallest candidate RT value
    if BL >= Laudio:
        return par['Tquant'][0]

    '''
      frame-wise processing in the time-domain
    '''
    rt_est = []    # per-frame smoothed RT estimates
    RT_final = []  # per-frame raw ML estimates (-1 when no decay was detected)

    for n in np.arange(0, Laudio - BL + 1, par['N_shift']):
        ind = np.arange(n, n + BL)  # indices of current frame

        # actual RT estimation on the downsampled frame
        RT, par, finalrt = rt_estimate_frame_my(audio_samples[ind[np.arange(0, len(ind), par['down'])]], par)

        rt_est.append(RT)        # store estimated value
        RT_final.append(finalrt)

    # keep only frames where a decay was detected (raw ML estimate > 0);
    # dead post-processing of the original (RT_temp_new / RTfinal_value, both
    # computed and never used) has been removed
    RT_final = np.clip(RT_final, 0, max(RT_final))
    rt_est = np.array(rt_est)[np.where(RT_final > 0)]

    if rt_est.size:
        return np.mean(rt_est)
    # no valid decay found in any frame: fall back to the smallest candidate RT
    return par['Tquant'][0]
146
+
147
+
148
def init_rt_estimate_e(fs=24000):
    '''
    Build the parameter/buffer dictionary used by rt_estimate_frame_my to
    perform a blind estimation of the reverberation time (RT) by frame-wise
    processing in the time domain.

    INPUT
      fs: sampling frequency (default = 24 kHz)

    OUTPUT
      par: dict containing all parameters and buffers for executing the
           function rt_estimate_frame_my

    Based on the original initialisation by Heiner Loellmann, IND,
    RWTH Aachen University (August 2011).
    '''
    par = {'fs': fs}

    # correction factor to account for a sampling frequency other than 24 kHz
    rate_scale = par['fs'] / 24000.0

    # downsampling applied before RT estimation to reduce computational
    # complexity (only worthwhile above 8 kHz)
    par['down'] = 2 if par['fs'] > 8e3 else 1

    # frame geometry for the pre-selection of suitable segments
    par['N_sub'] = int(round(rate_scale * 700 / par['down']))    # sub-frame length (after downsampling)
    par['N_shift'] = int(round(rate_scale * 200 / par['down']))  # frame shift (before downsampling)
    par['nos_min'] = 3                                           # minimal number of subframes to detect a sound decay
    par['nos_max'] = 7                                           # maximal number of subframes to detect a sound decay
    par['N'] = int(par['nos_max'] * par['N_sub'])                # maximal frame length (after downsampling)

    # quantised RT grid for the ML estimation
    rt_min = 0.2   # min RT being considered
    rt_max = 1.1   # max RT being considered
    par['bin'] = 0.1  # step-size for RT estimation
    par['Tquant'] = np.arange(rt_min, rt_max + par['bin'] / 2, par['bin'])
    # decay-rate factor corresponding to each candidate RT
    par['a'] = np.exp(-3.0 * np.log(10) / (par['Tquant'] * (par['fs'] / par['down'])))
    par['La'] = len(par['a'])  # number of considered decay-rate factors (= no. of RTs)

    # histogram buffers for the outlier-reducing order statistics
    par['buffer_size'] = int(round(rate_scale * 800 / par['down']))
    par['buffer'] = np.zeros(par['buffer_size'])  # previous bin indices used to age the histogram
    par['no_bins'] = int(par['La'])
    par['hist_limits'] = np.arange(rt_min - par['bin'] / 2.0, rt_max + par['bin'], par['bin'])
    par['hist_rt'] = np.zeros(par['no_bins'])  # histogram of ML estimates
    par['hist_counter'] = 0                    # incremented whenever the histogram is updated

    # recursive smoothing of the final RT estimate
    par['alpha'] = 0.995              # smoothing factor
    par['RT_initial'] = 0.3           # initial RT estimate
    par['RT_last'] = par['RT_initial']  # last RT estimate
    par['RT_raw'] = par['RT_initial']   # raw RT estimate obtained by the histogram approach

    return par
206
+
207
+
208
def rt_estimate_frame_my(frame, par):
    '''
    Performs an efficient blind estimation of the reverberation time (RT) for frame-wise
    processing based on a Laplacian distribution.

    INPUT
      frame: (time-domain) segment with reverberant speech
      par:   struct with all parameters and buffers created by the function
             init_rt_estimate_e

    OUTPUT
      RT:     estimated RT (recursively smoothed)
      par:    struct with updated buffers to enable frame-wise processing
      RT_pre: raw ML estimate, -1 when no decay was detected in this frame
              (for debugging and analysis of the algorithm)

    Reference: Loellmann, H.W., Jeub, M., Yilmaz, E., and Vary, P.:
    An Improved Algorithm for Blind Reverberation Time Estimation,
    International Workshop on Acoustic Echo and Noise Control (IWAENC), Tel Aviv, Israel, Aug. 2010.

    Tariqullah Jan and Wenwu Wang:
    Blind reverberation time estimation based on Laplacian distribution,
    European Signal Processing Conference (EUSIPCO), 2012.

    The codes were adapted based on the original codes by Heinrich Loellmann, IND, RWTH Aachen.

    Authors: Tariqullah Jan, moderated by Wenwu Wang, University of Surrey (2012)
    '''
    # a multichannel frame indicates a caller error — channels must be split upstream
    if len(np.shape(np.squeeze(frame))) > 1:
        raise ValueError('Something went wrong...')

    cnt = 0    # sub-frame counter for pre-selection of possible sound decay
    RTml = -1  # default RT estimate (-1 indicates no new RT estimate)

    # calculate variance, minimum and maximum of first sub-frame
    seg = frame[:par['N_sub']]

    var_pre = np.var(seg)
    min_pre = np.min(seg)
    max_pre = np.max(seg)

    for k in range(2, par['nos_max']):
        # calculate variance, minimum and maximum of succeeding sub-frame
        seg = frame[(k-1) * par['N_sub'] : k * par['N_sub']+1]
        var_cur = np.var(seg)
        max_cur = max(seg)
        min_cur = min(seg)

        # -- Pre-selection of suitable speech decays --------------------
        if (var_pre > var_cur) and (max_pre > max_cur) and (min_pre < min_cur):
            # variance and maximum decrease while the minimum increases
            # => possible sound decay detected
            cnt += 1

            # current values become previous values
            var_pre = var_cur
            max_pre = max_cur
            min_pre = min_cur

        else:
            if cnt >= par['nos_min']:
                # minimum length for assumed sound decay achieved?
                # -- Maximum Likelihood (ML) estimation of the RT
                RTml, _ = max_loglf(frame[:cnt*par['N_sub']], par['a'], par['Tquant'])

            break

        if k == par['nos_max']:
            # maximum frame length achieved?
            # NOTE(review): range(2, par['nos_max']) stops at nos_max - 1, so this
            # branch is unreachable; the MATLAB original (for k = 2:nos_max) did
            # reach k == nos_max — confirm whether the loop should be inclusive
            # before changing, as fixing it alters the numeric estimates.
            RTml, _ = max_loglf(frame[0:cnt * par['N_sub']], par['a'], par['Tquant'])

    # end of sub-frame loop

    if RTml >= 0:  # new ML estimate calculated

        # apply order statistics to reduce outliers
        par['hist_counter'] += 1

        for i in range(par['no_bins']):

            # find index of the histogram bin containing the ML estimate
            # NOTE(review): `index` stays unbound if RTml falls outside
            # par['hist_limits']; safe in practice since RTml is always drawn
            # from par['Tquant'], which the limits cover.
            if (RTml >= par['hist_limits'][i]) and (RTml <= par['hist_limits'][i+1]):

                index = i
                break

        # update histogram with ML estimates for the RT
        par['hist_rt'][index] += 1

        if par['hist_counter'] > par['buffer_size'] + 1:
            # remove old values from histogram (ages out the oldest buffered bin)
            par['hist_rt'][int(par['buffer'][0])] = par['hist_rt'][int(par['buffer'][0])] - 1

        par['buffer'] = np.append(par['buffer'][1:], index)  # update buffer with indices
        idx = np.argmax(par['hist_rt'])  # find index for maximum of the histogram

        par['RT_raw'] = par['Tquant'][idx]  # map index to RT value


    # final RT estimate obtained by recursive smoothing
    RT = par['alpha'] * par['RT_last'] + (1 - par['alpha']) * par['RT_raw']
    par['RT_last'] = RT

    RT_pre = RTml  # intermediate ML estimate for later analysis

    return RT, par, RT_pre
316
+
317
+
318
def max_loglf(h, a, Tquant):
    '''
    [ML, ll] = max_loglf(h, a, Tquant)

    Returns the maximum of the log-likelihood (LL) function, and the LL
    function itself, for a finite set of decay rates under a Laplacian
    decay model.

    INPUT
      h:      input frame containing the assumed sound decay
      a:      finite set of decay-rate factors for which the max should be found
      Tquant: corresponding RT values for vector a

    OUTPUT
      ML: ML estimate for the RT (an entry of Tquant)
      ll: underlying LL function, one value per entry of a
    '''

    N = len(h)
    n = np.arange(0, N)  # sample indices within the frame
    ll = np.zeros(len(a))

    # magnitudes of the decay samples; computed once outside the loop
    # (the original also computed an unused plain sum here every iteration)
    abs_h = np.abs(h.transpose())

    for i in range(len(a)):
        # decay-compensated magnitude sum for candidate rate a[i]
        sum1 = np.dot(a[i] ** (-1.0 * n), abs_h)
        # ML estimate of the Laplacian scale parameter
        sigma = (1 / N) * sum1
        # Laplacian log-likelihood for this candidate decay rate
        ll[i] = -N * np.log(2) - N * np.log(sigma) - np.sum(np.log(a[i] ** n)) - (1 / sigma) * sum1

    idx = np.argmax(ll)  # maximum of the log-likelihood function
    ML = Tquant[idx]     # corresponding ML estimate for the RT

    return ML, ll
353
+
354
+
355
def reverb_logistic_regression(mean_RT60):
    """
    Map a blind RT60 estimate to the probability that the file sounds reverberant.

    :param mean_RT60: float, estimated reverberation time.
    :return: float in (0, 1), probability from the fitted logistic model.
    """
    # single-feature logistic model, coefficients fitted offline
    slope = 2.97126461
    intercept = -1.45082989
    logit_model = np.sum(np.array([slope]) * np.array([mean_RT60])) + intercept

    # inverse of the logit function gives the probability
    expo = np.exp(logit_model)
    return expo / (1.0 + expo)
timbral_models/Timbral_Roughness.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from . import timbral_util
5
+
6
+
7
def plomp(f1, f2):
    """
    Plomp's algorithm for estimating roughness of a pair of frequency
    components.

    :param f1: float/array, first frequency of each pair.
    :param f2: float/array, second frequency of each pair.
    :return: estimated dissonance (roughness) contribution per pair.
    """
    # model constants
    decay_fast = 3.51
    decay_slow = 5.75
    x_star = 0.24
    slope = 0.0207
    offset = 18.96

    # critical-bandwidth scaling; np.tril keeps only the lower triangle so
    # each frequency pair contributes once
    bandwidth = (slope * np.minimum(f1, f2)) + offset
    s = np.tril(x_star / bandwidth)

    delta = np.abs(f2 - f1)
    pd = np.exp(-decay_fast * s * delta) - np.exp(-decay_slow * s * delta)
    return pd
23
+
24
+
25
def timbral_roughness(fname, dev_output=False, phase_correction=False, clip_output=False, fs=0, peak_picking_threshold=0.01):
    """
    This function is an implementation of the Vassilakis [2007] model of roughness.
    The peak picking algorithm implemented is based on the MIR toolbox's implementation.

    This version of timbral_roughness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Vassilakis, P. 'SRA: A web-based research tool for spectral and roughness analysis of sound signals', Proceedings
    of the 4th Sound and Music Computing Conference, Lefkada, Greece, July, 2007.

    Required parameter
    :param fname: string, Audio filename to be analysed, including full file path and extension.

    Optional parameters
    :param dev_output: bool, when False return the roughness, when True return all extracted features
                       (currently none beyond the mean roughness).
    :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum.
                             Defaults to False.
    :param clip_output: bool, force the output to be between 0 and 100. Defaults to False.
    :param fs: int, sample rate; when 0 the rate is read from the file by timbral_util.file_read. Defaults to 0.
    :param peak_picking_threshold: float, normalised-magnitude threshold used for spectral peak detection.
                                   Defaults to 0.01.

    :return: Roughness of the audio signal.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    '''
    Pad audio
    '''
    # pad audio with 512 leading zeros so onsets at the very start are not lost
    audio_samples = np.lib.pad(audio_samples, (512, 0), 'constant', constant_values=(0.0, 0.0))

    '''
    Reshape audio into time windows of 50ms.
    '''
    # reshape audio into 50 ms half-overlapping Hamming-windowed frames
    audio_len = len(audio_samples)
    time_step = 0.05
    step_samples = int(fs * time_step)
    nfft = step_samples
    # build the window two samples long and trim the zero-valued endpoints
    window = np.hamming(nfft + 2)
    window = window[1:-1]
    olap = nfft / 2
    num_frames = int((audio_len)/(step_samples-olap))
    # zero-pad each frame up to the next power of two for the FFT
    next_pow_2 = np.log(step_samples) / np.log(2)
    next_pow_2 = 2 ** int(next_pow_2 + 1)

    reshaped_audio = np.zeros([next_pow_2, num_frames])

    i = 0
    start_idx = int((i * (nfft / 2.0)))

    # check if audio is too short to be reshaped
    if audio_len > step_samples:
        # get all the audio, frame by frame with 50% overlap
        while start_idx+step_samples <= audio_len:
            audio_frame = audio_samples[start_idx:start_idx+step_samples]

            # apply window
            audio_frame = audio_frame * window

            # append zeros (frame is shorter than the zero-padded FFT length)
            reshaped_audio[:step_samples, i] = audio_frame

            # increase the step
            i += 1
            start_idx = int((i * (nfft / 2.0)))
    else:
        # reshaped audio is just padded audio samples
        reshaped_audio[:audio_len, i] = audio_samples

    # magnitude spectrogram, keeping only the positive-frequency half
    spec = np.fft.fft(reshaped_audio, axis=0)
    spec_len = int(next_pow_2/2) + 1
    spec = spec[:spec_len, :]
    spec = np.absolute(spec)

    freq = fs/2 * np.linspace(0, 1, spec_len)

    # normalise spectrogram based from peak TF bin
    norm_spec = (spec - np.min(spec)) / (np.max(spec) - np.min(spec))

    ''' Peak picking algorithm '''
    cthr = peak_picking_threshold  # threshold for peak picking

    _, no_segments = np.shape(spec)

    allpeakpos = []
    allpeaklevel = []
    allpeaktime = []

    for i in range(0, no_segments):
        d = norm_spec[:, i]
        d_un = spec[:, i]

        # find peak candidates (positions, unprocessed levels, frequencies)
        peak_pos, peak_level, peak_x = timbral_util.detect_peaks(d, cthr=cthr, unprocessed_array=d_un, freq=freq)

        allpeakpos.append(peak_pos)
        allpeaklevel.append(peak_level)
        allpeaktime.append(peak_x)

    ''' Calculate the Vasillakis Roughness '''
    allroughness = []
    # for each frame
    for frame in range(len(allpeaklevel)):
        frame_freq = allpeaktime[frame]
        frame_level = allpeaklevel[frame]

        if len(frame_freq) > 1:
            # build all pairwise combinations of peak frequencies/levels
            f2 = np.kron(np.ones([len(frame_freq), 1]), frame_freq)
            f1 = f2.T
            v2 = np.kron(np.ones([len(frame_level), 1]), frame_level)
            v1 = v2.T

            # Vassilakis (2007) roughness terms: amplitude, amplitude
            # fluctuation, and Plomp dissonance
            X = v1 * v2
            Y = (2 * v2) / (v1 + v2)
            Z = plomp(f1, f2)
            rough = (X ** 0.1) * (0.5 * (Y ** 3.11)) * Z

            allroughness.append(np.sum(rough))
        else:
            # fewer than two peaks: no pair to beat against, zero roughness
            allroughness.append(0)

    mean_roughness = np.mean(allroughness)

    if dev_output:
        return [mean_roughness]
    else:
        '''
        Perform linear regression
        '''
        # cap roughness for low end (log10 of a near-zero value would diverge)
        if mean_roughness < 0.01:
            return 0
        else:
            # map the raw model output to the 0-100 subjective scale
            roughness = np.log10(mean_roughness) * 13.98779569 + 48.97606571545886
            if clip_output:
                roughness = timbral_util.output_clip(roughness)

            return roughness
183
+
184
+
185
+
timbral_models/Timbral_Sharpness.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from . import timbral_util
5
+
6
+
7
def sharpness_Fastl(loudspec):
    """
    Calculate sharpness based on Fastl (1991).

    The weighting function g(z) is unity for the first 140 tenth-bark bands
    and follows a fitted fourth-order polynomial above that, as obtained by
    fitting data from 'Psychoacoustics: Facts and Models'.

    Original MATLAB code by Claire Churchill Sep 2004, transcoded by
    Andy Pearce 2018.

    :param loudspec: array, specific loudness per 0.1-bark band.
    :return: float, sharpness value.
    """
    num_bands = len(loudspec)

    # weighting: ones up to band 140, polynomial beyond
    upper_bands = np.arange(141, num_bands + 1)
    scaled = upper_bands / 10.0
    # same term order as the original fit so floating-point results match
    poly = 0.00012 * scaled ** 4 - 0.0056 * scaled ** 3 + 0.1 * scaled ** 2 - 0.81 * scaled + 3.5
    weighting = np.concatenate((np.ones(140), poly))

    # critical-band rate axis in 0.1-bark steps
    bark_axis = np.arange(0.1, num_bands / 10.0 + 0.1, 0.1)

    numerator = np.sum(loudspec * weighting * bark_axis * 0.1)
    denominator = np.sum(loudspec * 0.1)
    return 0.11 * numerator / denominator
25
+
26
+
27
def timbral_sharpness(fname, dev_output=False, phase_correction=False, clip_output=False, fs=0):
    """
    This is an implementation of the matlab sharpness function found at:
    https://www.salford.ac.uk/research/sirc/research-groups/acoustics/psychoacoustics/sound-quality-making-products-sound-better/accordion/sound-quality-testing/matlab-codes

    This function calculates the apparent Sharpness of an audio file.
    This version of timbral_sharpness contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Originally coded by Claire Churchill Sep 2004
    Transcoded by Andy Pearce 2018

    Required parameter
    :param fname: string, audio filename to be analysed, including full file path and extension.

    Optional parameters
    :param dev_output: bool, when False return the sharpness, when True return all extracted features
    :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum.
                             Defaults to False.
    :param clip_output: bool, force the output to be between 0 and 100. Defaults to False.
    :param fs: int, sample rate; when 0 the rate is read from the file. Defaults to 0.

    :return Apparent sharpness of the audio file.


    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.

    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    # window the audio file into 4096 sample sections
    windowed_audio = timbral_util.window_audio(audio_samples, window_length=4096)

    windowed_sharpness = []
    windowed_rms = []
    for i in range(windowed_audio.shape[0]):
        samples = windowed_audio[i, :]

        # calculate the rms and append to list (used later as weighting)
        windowed_rms.append(np.sqrt(np.mean(samples * samples)))

        # calculate the specific loudness
        N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0)

        # calculate the sharpness if section contains audio
        if N_entire > 0:
            sharpness = sharpness_Fastl(N_single)
        else:
            # silent window contributes zero sharpness
            sharpness = 0

        windowed_sharpness.append(sharpness)

    # convert lists to numpy arrays for fancy indexing
    windowed_rms = np.array(windowed_rms)
    windowed_sharpness = np.array(windowed_sharpness)
    # calculate the sharpness as the rms-weighted average of sharpness
    rms_sharpness = np.average(windowed_sharpness, weights=(windowed_rms * windowed_rms))

    # take the logarithm to better match subjective ratings
    rms_sharpness = np.log10(rms_sharpness)

    if dev_output:
        return [rms_sharpness]
    else:

        # second entry stays 1.0 and acts as the regression intercept term
        all_metrics = np.ones(2)
        all_metrics[0] = rms_sharpness

        # coefficients from linear regression (slope, intercept)
        coefficients = [102.50508921364404, 34.432655185001735]

        # apply regression
        sharpness = np.sum(all_metrics * coefficients)

        if clip_output:
            sharpness = timbral_util.output_clip(sharpness)

        return sharpness
timbral_models/Timbral_Warmth.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from scipy.signal import spectrogram
5
+ import scipy.stats
6
+ from sklearn import linear_model
7
+ from . import timbral_util
8
+
9
+
10
def warm_region_cal(audio_samples, fs):
    """
    Function for calculating various warmth parameters.

    :param audio_samples: numpy.array, an array of the audio samples, requires only one dimension.
    :param fs: int, the sample rate of the audio file.

    :return: four outputs: mean warmth region, weighted-average warmth region, mean high frequency level,
             weighted-average high frequency level.
    """
    # window the audio
    windowed_samples = timbral_util.window_audio(audio_samples)

    # weighting emphasising the warmth region of the 0.1-bark-scale bands
    # NOTE(review): the code spans bands 10-40 although the original comment
    # said 20-40 — confirm which range the model intends.
    min_bark_band = 10
    max_bark_band = 40
    mean_bark_band = (min_bark_band + max_bark_band) / 2.0
    array = np.arange(min_bark_band, max_bark_band)
    # bell curve centred on the band range, normalised to 0-1
    x = timbral_util.normal_dist(array, theta=0.01, mean=mean_bark_band)
    x -= np.min(x)
    x /= np.max(x)

    wr_array = np.zeros(240)
    wr_array[min_bark_band:max_bark_band] = x

    # second weighting emphasising the high-frequency region (bands 80-240)
    min_bark_band = 80
    max_bark_band = 240
    mean_bark_band = (min_bark_band + max_bark_band) / 2.0
    array = np.arange(min_bark_band, max_bark_band)
    x = timbral_util.normal_dist(array, theta=0.01, mean=mean_bark_band)
    x -= np.min(x)
    x /= np.max(x)

    hf_array = np.zeros(240)
    hf_array[min_bark_band:max_bark_band] = x

    windowed_loud_spec = []
    windowed_rms = []

    wr_vals = []
    hf_vals = []

    for i in range(windowed_samples.shape[0]):
        samples = windowed_samples[i, :]
        # specific loudness per 0.1-bark band for this window
        N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0)

        # append the loudness spec
        windowed_loud_spec.append(N_single)
        windowed_rms.append(np.sqrt(np.mean(samples * samples)))

        # weighted loudness in the warmth and high-frequency regions
        wr_vals.append(np.sum(wr_array * N_single))
        hf_vals.append(np.sum(hf_array * N_single))

    # simple means and RMS-weighted averages across windows
    mean_wr = np.mean(wr_vals)
    mean_hf = np.mean(hf_vals)
    weighted_wr = np.average(wr_vals, weights=windowed_rms)
    weighted_hf = np.average(hf_vals, weights=windowed_rms)

    return mean_wr, weighted_wr, mean_hf, weighted_hf
70
+
71
+
72
def timbral_warmth(fname, dev_output=False, phase_correction=False, clip_output=False, max_FFT_frame_size=8192,
                   max_WR = 12000, fs=0):
    """
    This function estimates the perceptual Warmth of an audio file.

    This model of timbral_warmth contains self loudness normalising methods and can accept arrays as an input
    instead of a string filename.

    Version 0.4

    Required parameter
    :param fname: string, Audio filename to be analysed, including full file path and extension.

    Optional parameters
    :param dev_output: bool, when False return the warmth, when True return all extracted features in a
                       list.
    :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum.
                             Defaults to False.
    :param clip_output: bool, force the output to be between 0 and 100. Defaults to False.
    :param max_FFT_frame_size: int, Frame size for calculating spectrogram, default to 8192.
    :param max_WR: float, maximum allowable warmth region frequency, defaults to 12000.
    :param fs: int, sample rate; when 0 the rate is read from the file. Defaults to 0.

    :return: Estimated warmth of audio file.

    Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.

    """
    '''
    Read input
    '''
    audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction)

    # get the weighted high frequency content
    mean_wr, _, _, weighted_hf = warm_region_cal(audio_samples, fs)

    # calculate the onsets
    envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.1)
    envelope_time = np.arange(len(envelope)) / float(fs)

    # calculate the onsets
    nperseg = 4096
    original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg)
    # If onsets don't exist, set it to time zero
    if not original_onsets:
        original_onsets = [0]
    # set to start of file in the case where there is only one onset
    if len(original_onsets) == 1:
        original_onsets = [0]
    '''
    Initialise lists for storing features
    '''
    # set defaults for holding per-onset features
    all_rms = []
    all_ratio = []
    all_SC = []
    all_WR_Ratio = []
    all_decay_score = []


    # calculate metrics for each onset
    for idx, onset in enumerate(original_onsets):
        if onset == original_onsets[-1]:
            # this is the last onset
            segment = audio_samples[onset:]
        else:
            segment = audio_samples[onset:original_onsets[idx+1]]

        segment_rms = np.sqrt(np.mean(segment * segment))
        all_rms.append(segment_rms)

        # get FFT of signal; short segments use their own length as the window
        segment_length = len(segment)
        if segment_length < max_FFT_frame_size:
            freq, time, spec = spectrogram(segment, fs, nperseg=segment_length, nfft=max_FFT_frame_size)
        else:
            freq, time, spec = spectrogram(segment, fs, nperseg=max_FFT_frame_size, nfft=max_FFT_frame_size)

        # flatten the audio to 1 dimension. Catches some strange errors that cause crashes
        if spec.shape[1] > 1:
            spec = np.sum(spec, axis=1)
            spec = spec.flatten()

        # normalise for this onset
        spec = np.array(list(spec)).flatten()
        this_shape = spec.shape  # retained from original; currently unused
        spec /= max(abs(spec))

        '''
        Estimate of fundamental frequency
        '''
        # peak picking algorithm
        peak_idx, peak_value, peak_x = timbral_util.detect_peaks(spec, freq=freq, fs=fs)
        # find lowest peak; treated as the fundamental
        fundamental = np.min(peak_x)
        fundamental_idx = np.min(peak_idx)

        '''
        Warmth region calculation
        '''
        # estimate the Warmth region as 3.5x the fundamental, capped at max_WR
        WR_upper_f_limit = fundamental * 3.5
        if WR_upper_f_limit > max_WR:
            WR_upper_f_limit = 12000
        tpower = np.sum(spec)
        # NOTE(review): raises IndexError if no bin exceeds WR_upper_f_limit
        # (e.g. low sample rates) — confirm callers guard against this.
        WR_upper_f_limit_idx = int(np.where(freq > WR_upper_f_limit)[0][0])

        if fundamental < 260:
            # find frequency bin closest to 260Hz
            top_level_idx = int(np.where(freq > 260)[0][0])
            # sum energy up to this bin
            low_energy = np.sum(spec[fundamental_idx:top_level_idx])
            # sum all energy
            tpower = np.sum(spec)
            # take ratio
            ratio = low_energy / float(tpower)
        else:
            # make exception where fundamental is greater than 260 Hz
            ratio = 0

        all_ratio.append(ratio)

        '''
        Spectral centroid of the segment
        '''
        # spectral centroid (top/bottom retained from original; unused)
        top = np.sum(freq * spec)
        bottom = float(np.sum(spec))
        SC = np.sum(freq * spec) / float(np.sum(spec))
        all_SC.append(SC)

        '''
        HF decay
        - linear regression of the values above the warmth region
        '''
        # NOTE(review): log10 diverges for zero-valued bins — presumably the
        # normalised spectrum is strictly positive here; verify.
        above_WR_spec = np.log10(spec[WR_upper_f_limit_idx:])
        above_WR_freq = np.log10(freq[WR_upper_f_limit_idx:])
        np.ones_like(above_WR_freq)  # retained from original; has no effect
        metrics = np.array([above_WR_freq, np.ones_like(above_WR_freq)])

        # create a linear regression model (intercept carried in metrics)
        model = linear_model.LinearRegression(fit_intercept=False)
        model.fit(metrics.transpose(), above_WR_spec)
        decay_score = model.score(metrics.transpose(), above_WR_spec)
        all_decay_score.append(decay_score)


    '''
    get mean values
    '''
    mean_SC = np.log10(np.mean(all_SC))
    mean_decay_score = np.mean(all_decay_score)
    weighted_mean_ratio = np.average(all_ratio, weights=all_rms)

    if dev_output:
        return mean_SC, weighted_hf, mean_wr, mean_decay_score, weighted_mean_ratio
    else:

        '''
        Apply regression model
        '''
        # last entry stays 1.0 and acts as the regression intercept term
        all_metrics = np.ones(6)
        all_metrics[0] = mean_SC
        all_metrics[1] = weighted_hf
        all_metrics[2] = mean_wr
        all_metrics[3] = mean_decay_score
        all_metrics[4] = weighted_mean_ratio

        coefficients = np.array([-4.464258317026696,
                                 -0.08819320850778556,
                                 0.29156539973575546,
                                 17.274733561081554,
                                 8.403340066029507,
                                 45.21212125085579])

        warmth = np.sum(all_metrics * coefficients)

        # clip output between 0 and 100
        if clip_output:
            warmth = timbral_util.output_clip(warmth)

        return warmth
timbral_models/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = '0.4.1'
2
+
3
+ from .Timbral_Brightness import timbral_brightness
4
+ from .Timbral_Depth import timbral_depth
5
+ from .Timbral_Hardness import timbral_hardness
6
+ from .Timbral_Roughness import timbral_roughness
7
+ from .Timbral_Warmth import timbral_warmth
8
+ from .Timbral_Sharpness import timbral_sharpness
9
+ from .Timbral_Booming import timbral_booming
10
+ from .Timbral_Reverb import timbral_reverb
11
+ from .Timbral_Extractor import timbral_extractor
12
+ from .timbral_util import *
timbral_models/__pycache__/Timbral_Booming.cpython-310.pyc ADDED
Binary file (5.03 kB). View file
 
timbral_models/__pycache__/Timbral_Brightness.cpython-310.pyc ADDED
Binary file (4.98 kB). View file
 
timbral_models/__pycache__/Timbral_Depth.cpython-310.pyc ADDED
Binary file (6.56 kB). View file
 
timbral_models/__pycache__/Timbral_Extractor.cpython-310.pyc ADDED
Binary file (3.71 kB). View file
 
timbral_models/__pycache__/Timbral_Hardness.cpython-310.pyc ADDED
Binary file (5.98 kB). View file
 
timbral_models/__pycache__/Timbral_Reverb.cpython-310.pyc ADDED
Binary file (8.36 kB). View file
 
timbral_models/__pycache__/Timbral_Roughness.cpython-310.pyc ADDED
Binary file (4.55 kB). View file
 
timbral_models/__pycache__/Timbral_Sharpness.cpython-310.pyc ADDED
Binary file (3.84 kB). View file
 
timbral_models/__pycache__/Timbral_Warmth.cpython-310.pyc ADDED
Binary file (5.86 kB). View file
 
timbral_models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (702 Bytes). View file
 
timbral_models/__pycache__/timbral_util.cpython-310.pyc ADDED
Binary file (41.7 kB). View file
 
timbral_models/timbral_util.py ADDED
@@ -0,0 +1,1816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division, print_function
2
+ import numpy as np
3
+ import librosa
4
+ import soundfile as sf
5
+ from scipy.signal import butter, lfilter, spectrogram
6
+ import scipy.stats
7
+ import pyloudnorm as pyln
8
+ import six
9
+
10
+ """
11
+ The timbral util is a collection of functions that can be accessed by the individual timbral models. These can be
12
+ used for extracting features or manipulating the audio that are useful to multiple attributes.
13
+
14
+ Version 0.4
15
+
16
+ Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK.
17
+
18
+ Licensed under the Apache License, Version 2.0 (the "License");
19
+ you may not use this file except in compliance with the License.
20
+ You may obtain a copy of the License at
21
+
22
+ http://www.apache.org/licenses/LICENSE-2.0
23
+
24
+ Unless required by applicable law or agreed to in writing, software
25
+ distributed under the License is distributed on an "AS IS" BASIS,
26
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27
+ See the License for the specific language governing permissions and
28
+ limitations under the License.
29
+ """
30
+
31
+
32
def db2mag(dB):
    """
    Convert a decibel level to linear magnitude.

    :param dB: dB level to be converted.
    :return: linear magnitude of the dB input.
    """
    return 10 ** (dB / 20.0)
41
+
42
+
43
def get_percussive_audio(audio_samples, return_ratio=True):
    """
    Gets the percussive component of the audio file.
    Currently, the default values for harmonic/percussive decomposition have been used.
    Future updates may change the defaults for better separation or to improve the correlation to subjective data.

    :param audio_samples: The audio samples to be harmonically/percussively separated.
    :param return_ratio: Determines the value returned by the function.

    :return: If return_ratio is True (default), the ratio of percussive energy is returned
             (an rms-weighted average; an empty list if no frame has energy).
             If False, the function returns the percussive audio as a time domain array.
    """
    # use librosa harmonic/percussive source separation on the STFT
    D = librosa.core.stft(audio_samples)
    H, P = librosa.decompose.hpss(D)

    # inverse transform to get time domain arrays
    percussive_audio = librosa.core.istft(P)
    harmonic_audio = librosa.core.istft(H)

    if return_ratio:
        # frame by frame RMS energy
        percussive_energy = calculate_rms_enveope(percussive_audio, step_size=1024, overlap_step=512, normalise=False)
        harmonic_energy = calculate_rms_enveope(harmonic_audio, step_size=1024, overlap_step=512, normalise=False)

        # set defaults for storing the data
        ratio = []
        t_power = []

        # get the ratio for each RMS time frame, skipping all-silent frames
        for i in range(len(percussive_energy)):
            if percussive_energy[i] != 0 or harmonic_energy[i] != 0:
                # if percussive_energy[i] != 0 and harmonic_energy[i] != 0:
                ratio.append(percussive_energy[i] / (percussive_energy[i] + harmonic_energy[i]))
                t_power.append((percussive_energy[i] + harmonic_energy[i]))

        if t_power:
            # take a weighted average of the ratio
            ratio = np.average(ratio, weights=t_power)
        # NOTE(review): when every frame is silent, the empty list is returned
        # unchanged — confirm callers handle a non-scalar return.
        return ratio
    else:
        # return the percussive audio when return_ratio is False
        return percussive_audio
86
+
87
+
88
def filter_audio_highpass(audio_samples, crossover, fs, order=2):
    """ Calculate and apply a high-pass filter, with a -3dB point of crossover.

    :param audio_samples: data to be filtered as an array.
    :param crossover: the crossover frequency of the filter.
    :param fs: the sampling frequency of the audio file.
    :param order: order of the filter, defaults to 2.

    :return: filtered array.
    """
    # normalise the crossover frequency to the Nyquist rate for scipy
    normalised_cutoff = crossover / (0.5 * fs)
    numerator, denominator = butter(order, normalised_cutoff, 'high')
    return lfilter(numerator, denominator, audio_samples)
103
+
104
+
105
def filter_audio_lowpass(audio_samples, crossover, fs, order=2):
    """ Calculate and apply a low-pass filter, with a -3dB point of crossover.

    :param audio_samples: data to be filtered as an array.
    :param crossover: the crossover frequency of the filter.
    :param fs: the sampling frequency of the audio file.
    :param order: order of the filter, defaults to 2.

    :return: filtered array.
    """
    # normalise the crossover frequency to the Nyquist rate for scipy
    normalised_cutoff = crossover / (0.5 * fs)
    numerator, denominator = butter(order, normalised_cutoff, 'low')
    return lfilter(numerator, denominator, audio_samples)
120
+
121
+
122
def butter_bandpass(lowcut, highcut, fs, order=2):
    """ Design a butterworth bandpass filter.

    :param lowcut: lower -3dB edge frequency in Hz.
    :param highcut: upper -3dB edge frequency in Hz.
    :param fs: sampling frequency of the audio.
    :param order: filter order, defaults to 2.

    :return: (b, a) filter coefficients.
    """
    nyquist = 0.5 * fs
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')
129
+
130
+
131
def filter_audio_bandpass(audio_samples, f0, noct, fs, order=2):
    """ Calculate and apply an n/octave butterworth bandpass filter, centred at f0 Hz.

    :param audio_samples: the audio file as an array.
    :param f0: the centre frequency of the bandpass filter.
    :param noct: fraction-of-octave bandwidth (filter spans 1/noct octave).
    :param fs: the sampling frequency of the audio file.
    :param order: order of the filter, defaults to 2.

    :return: audio file filtered.
    """
    # half-bandwidth ratio for a 1/noct-octave band centred on f0
    edge_ratio = 2 ** (1.0 / (noct * 2))
    low_edge = f0 / edge_ratio
    high_edge = f0 * edge_ratio

    numerator, denominator = butter_bandpass(low_edge, high_edge, fs, order=order)
    return lfilter(numerator, denominator, audio_samples)
149
+
150
+
151
def return_loop(onset_loc, envelope, function_time_thresh, hist_threshold, hist_time_samples, nperseg=512):
    """ This function is used by the calculate_onsets method.
    This looks backwards in time from the attack time and attempts to find the exact onset point by
    identifying the point backwards in time where the envelope no longer falls.
    This function includes a hysteresis to account for small deviations in the attack due to the
    envelope calculation.

    Function looks 10ms (function_time_thresh) backwards from the onset time (onset_loc), looking for any sample
    lower than the current sample. This repeats, starting at the minimum value until no smaller value is found.
    Then the function looks backwards over 200ms, checking if the increase is greater than 10% of the full envelope's
    dynamic range.

    :param onset_loc: the onset location estimated by librosa (converted to time domain index).
    :param envelope: envelope of the audio file.
    :param function_time_thresh: time threshold (in samples) for looking backwards in time; set by the
                                 caller to the number of samples equating to 10ms.
    :param hist_threshold: level threshold used over the hysteresis window to decide whether a small
                           peak is part of the same onset or a separate audio event.
    :param hist_time_samples: number of samples to look back after finding the minimum value over 10ms
                              (typically equivalent to 200ms).
    :param nperseg: minimum allowable onset index; onsets that would land at or before this are
                    reported as 0.

    :return: refined onset index, or 0 if the onset is too close to the start of the envelope.
    """

    # define flag for exiting while loop
    # NOTE(review): found_start is never set True; the loop only terminates via the return
    # statements below.
    found_start = False

    while not found_start:
        # get the current sample value
        current_sample = envelope[int(onset_loc)]
        # get the previous 10ms worth of samples
        if onset_loc - function_time_thresh > 0:
            evaluation_array = envelope[onset_loc - function_time_thresh - 1:onset_loc]
        else:
            evaluation_array = envelope[:onset_loc - 1]

        if min(evaluation_array) - current_sample <= 0:
            '''
            If the minimum value within previous 10ms is less than current sample,
            move to the start position to the minimum value and look again.
            '''
            min_idx = np.argmin(evaluation_array)
            new_onset_loc = min_idx + onset_loc - function_time_thresh - 1

            if new_onset_loc > nperseg:
                onset_loc = new_onset_loc
            else:
                ''' Current index is close to start of the envelope, so exit with the idx as 512 '''
                # NOTE(review): despite the comment, 0 (not nperseg) is returned here.
                return 0

        else:
            '''
            If the minimum value within previous 10ms is greater than current sample,
            introduce the time and level hysteresis to check again.
            '''
            # get the array of 200ms previous to the current onset idx
            if (onset_loc - hist_time_samples - 1) > 0:
                hyst_evaluation_array = envelope[onset_loc - hist_time_samples - 1:onset_loc]
            else:
                hyst_evaluation_array = envelope[:onset_loc]

            # values less than current sample
            all_match = np.where(hyst_evaluation_array < envelope[onset_loc])

            # if no minimum was found within the extended time, exit with current onset idx
            if len(all_match[0]) == 0:
                return onset_loc

            # get the idx of the closest value which is lower than the current onset idx
            last_min = all_match[0][-1]
            last_idx = int(onset_loc - len(hyst_evaluation_array) + last_min)

            # get the dynamic range of this segment
            segment_dynamic_range = max(hyst_evaluation_array[last_min:]) - min(hyst_evaluation_array[last_min:])

            # compare this dynamic range against the hysteresis threshold
            if segment_dynamic_range >= hist_threshold:
                '''
                The dynamic range is greater than the threshold, therefore this is a separate audio event.
                Return the current onset idx.
                '''
                return onset_loc
            else:
                '''
                The dynamic range is less than the threshold, therefore this is not a separate audio event.
                Set current onset idx to minimum value and repeat.
                '''
                if last_idx >= nperseg:
                    onset_loc = last_idx
                else:
                    '''
                    The hysteresis check puts the new threshold too close to the start
                    '''
                    return 0
242
+
243
+
244
def sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.2, hold_time=0.01):
    """
    Compute a 'sample and hold' style amplitude envelope of audio_samples.

    Unlike low-pass-filter envelope followers, this approach does not limit the
    minimum attack time that can be represented.

    :param audio_samples: audio array.
    :param fs: sampling frequency.
    :param decay_time: time (seconds) for the envelope to decay from the signal peak.
    :param hold_time: time (seconds) a peak is held before the decay begins.

    :return: envelope of audio_samples as a numpy array.
    """
    # full-wave rectify the signal
    rectified = abs(audio_samples)

    # decay rate per sample, relative to the overall peak level
    decay_step = max(rectified) / (decay_time * fs)
    # number of samples to hold a peak before decaying
    max_hold = hold_time * fs

    held_value = 0.0
    samples_held = 0
    env = []

    # sample, hold, and decay
    for level in rectified:
        if level >= held_value:
            # new peak: track it and restart the hold counter
            held_value = level
            samples_held = 0
            env.append(level)
        elif samples_held < max_hold:
            # still within the hold window: keep the previous peak
            samples_held += 1
            env.append(held_value)
        else:
            # hold expired: decay linearly unless the signal is above the decayed level
            decayed = held_value - decay_step
            held_value = decayed if decayed > level else level
            env.append(held_value)

    return np.array(env)
289
+
290
+
291
def get_spectral_features(audio, fs, lf_limit=20, scale='hz', cref=27.5, power=2, window_type='none',
                          rollon_thresh=0.05):
    """
    Calculate the spectral centroid, spectral spread, and unitless centroid of an audio array.

    :param audio: audio array.
    :param fs: sample rate of the audio file.
    :param lf_limit: low frequency limit, in Hz, below which spectrum bins are discarded.
                     Defaults to 20 Hz.
    :param scale: frequency scale used for the calculations: 'hz' (linear, the default),
                  'mel', 'erb', or 'cents'.
    :param cref: reference frequency for calculating cents. Defaults to 27.5 Hz.
    :param power: power to raise the deviation from the spectral centroid; retained for
                  interface compatibility (the active spread calculation uses a fixed
                  power of 2). Defaults to 2.
    :param window_type: analysis window, either 'hann' or 'none' (rectangular).
    :param rollon_thresh: proportion of total spectral power defining the roll-on
                          frequency. Defaults to 0.05.

    :return: (centroid, spread, unitless_centroid) tuple, or the scalar 0 if the signal
             has no spectral power (e.g. silence).

    :raises ValueError: if window_type or scale is not a recognised option.
    """
    # select the analysis window
    if window_type == 'hann':
        window = np.hanning(len(audio))
    elif window_type == 'none':
        window = np.ones(len(audio))  # rectangular window (no weighting)
    else:
        raise ValueError('Window type must be set to either \'hann\' or \'none\'')

    # zero-pad the FFT to the next power of two
    next_pow_2 = int(pow(2, np.ceil(np.log2(len(window)))))
    # magnitude spectrum of the positive-frequency half
    spectrum = np.fft.fft((window * audio), next_pow_2)
    spectrum = np.absolute(spectrum[0:int(len(spectrum) / 2) + 1])

    tpower = np.sum(spectrum)

    if tpower > 0:
        freq = np.arange(0, len(spectrum), 1) * (fs / (2.0 * (len(spectrum) - 1)))

        # discard all bins below the low-frequency limit
        lf_limit_idx = np.where(freq >= lf_limit)[0][0]
        spectrum = spectrum[lf_limit_idx:]
        freq = freq[lf_limit_idx:]

        # convert frequency to the desired frequency scale
        if scale == 'hz':
            pass  # already linear
        elif scale == 'mel':
            freq = 1127.0 * np.log(1 + (freq / 700.0))
        elif scale == 'erb':
            freq = 21.4 * np.log10(1 + (0.00437 * freq))
        elif scale == 'cents':
            # BUG FIX: this branch previously tested `freq == 'cents'` (comparing the
            # frequency array to a string), so scale='cents' always raised ValueError.
            freq = 1200.0 * np.log2((freq / cref) + 1.0)
        else:
            raise ValueError('Frequency scale type not recognised. Please use \'hz\', \'mel\', \'erb\', or \'cents\'.')

        # centroid: amplitude-weighted mean frequency
        centroid = sum(spectrum * freq) / float(sum(spectrum))

        # spread: amplitude-weighted standard deviation about the centroid
        deviation = np.abs(freq - centroid)
        spread = np.sqrt(np.sum((deviation ** 2) * spectrum) / np.sum(spectrum))

        # spectral roll-on: lowest frequency below which rollon_thresh of the total power lies
        cumulative_spectral_power = spectrum[0]
        counter = 0
        rollon_threshold = np.sum(spectrum) * rollon_thresh
        while cumulative_spectral_power < rollon_threshold:
            counter += 1
            cumulative_spectral_power = np.sum(spectrum[:counter])

        # guard against indexing bin 0 when the first bin already exceeds the threshold
        if counter == 0:
            counter = 1

        rollon_frequency = freq[counter]
        unitless_centroid = centroid / rollon_frequency

        return centroid, spread, unitless_centroid
    else:
        # silent input: no spectral power to analyse
        return 0
369
+
370
+
371
def calculate_attack_time(envelope_samples, fs, calculate_attack_segment=True, thresh_no=8, normalise=True, m=3,
                          calculation_type='min_effort', gradient_calulation_type='all', return_descriptive_data=False,
                          max_attack_time=-1):
    """
    Calculate the attack time from the envelope of a signal.

    Required inputs
    :param envelope_samples: envelope of the audio file, suggested to be calculated with
                             sample_and_hold_envelope_calculation.
    :param fs: sample rate of the envelope_samples.

    Optional inputs
    :param calculate_attack_segment: If the attack segment of the onset should be calculated before estimating the
                                     attack time. bool, default to True.
    :param thresh_no: Number of thresholds used for calculating the minimum effort method.
                      int, default to 8.
    :param m: value used for computation of minimum effort thresholds, defaults to 3 as
              suggested in the CUIDADO project.
    :param calculation_type: method for calculating the attack time, options are 'min_effort' or
                             'fixed_threshold', default to 'min_effort'.
    :param gradient_calulation_type: Method for calculating the gradient of the attack, options are 'all' for
                                     calculating the gradient from the estimated start and end points, or 'mean' for
                                     calculating the mean gradient between each threshold step in the minimum effort
                                     method. Defaults to 'all' and will revert to 'all' if mean is not available.
    :param normalise: Normalise the attack segment. bool, default to True.
                      NOTE(review): normalisation divides envelope_samples in place, mutating the
                      caller's array — confirm callers do not rely on the original values.
    :param return_descriptive_data: Default to False, if set to True also returns the thresholds for calculating
                                    the min_effort method.
    :param max_attack_time: sets the maximum allowable attack time. Defaults to -1, indicating that there
                            is no maximum attack time. This value should be set in seconds.

    :return: (attack_time, attack_gradient, attack start index, temporal centroid), with
             thresholds_to_return appended when return_descriptive_data is True; returns the
             scalar 0 if the envelope peaks at its first sample. attack_time is returned on a
             log10 scale.
    """
    if normalise:
        # normalise the segments (in place — see note in the docstring)
        normalise_factor = float(max(envelope_samples))
        envelope_samples /= normalise_factor

    if calculate_attack_segment:
        # identify pre-attack segment
        peak_idx = np.argmax(envelope_samples)
        if peak_idx == 0:
            # exit on error: no rising segment exists before the peak
            return 0
        # min_pre_peak_idx = np.argmin(envelope_samples[:peak_idx])
        # take the LAST occurrence of the pre-peak minimum, not the first
        min_pre_peak_idx = np.where(envelope_samples[:peak_idx] == min(envelope_samples[:peak_idx]))[-1][-1]

        # redefine the envelope samples as just the min to the peak
        envelope_samples = envelope_samples[min_pre_peak_idx:peak_idx + 1]
    else:
        min_pre_peak_idx = 0

    # calculate the appropriate start and end of the attack using the selected method
    if calculation_type == 'min_effort':
        # get threshold time array
        threshold_step = 1.0 / (thresh_no + 2)  # +2 is to ignore the 0 and 100% levels.
        dyn_range = max(envelope_samples) - min(envelope_samples)
        thresh_level = np.linspace(threshold_step, (1 - threshold_step), thresh_no + 1)
        thresh_level = (thresh_level * dyn_range) + min(envelope_samples)

        # predefine an array for when each threshold is crossed
        threshold_idxs = np.zeros(thresh_no + 1)

        # get indexes for when threshold is crossed
        for j in range(len(thresh_level)):
            threshold_hold = np.argmax(envelope_samples >= thresh_level[j])
            # threshold_idxs[j] = threshold_hold + min_pre_peak_idx
            threshold_idxs[j] = threshold_hold

        # calculate effort values (distances between thresholds)
        effort = np.diff(threshold_idxs)

        # get the mean effort value
        effort_mean = np.mean(effort)
        effort_threshold = effort_mean * m

        # find start and stop times for the attack
        th_start = np.argmax(effort <= effort_threshold)

        # need to use remaining effort values
        effort_hold = effort[th_start:]
        th_end = np.argmax(effort_hold >= effort_threshold)  # this returns a 0 if value not found
        if th_end == 0:
            th_end = len(effort_hold) - 1  # make equal to the last value

        # apply correction for holding the values
        th_end = th_end + th_start

        # get the actual start and stop index
        th_start_idx = threshold_idxs[th_start]
        th_end_idx = threshold_idxs[th_end]

        # degenerate case: widen to the full threshold span
        if th_start_idx == th_end_idx:
            th_start_idx = threshold_idxs[0]
            th_end_idx = threshold_idxs[-1]

        # still degenerate: fall back to a one-sample attack
        if th_start_idx == th_end_idx:
            attack_time = 1.0 / fs
        else:
            attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        if max_attack_time > 0:
            if attack_time > max_attack_time:
                # how many samples is equivalent to the maximum?
                max_attack_time_sample = int(fs * max_attack_time)  # convert to integer
                th_end_idx = th_start_idx + max_attack_time_sample
                attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        start_level = envelope_samples[int(th_start_idx)]
        end_level = envelope_samples[int(th_end_idx)]

        # specify exceptions for a step functions crossing both thresholds
        if start_level == end_level:
            if th_start_idx > 0:
                # if a previous sample is available, take the previous starting sample
                start_level = envelope_samples[int(th_start_idx) - 1]
            else:
                # set start level to zero if onset is at the first sample (indicating a step function at time zero)
                start_level = 0.0

        # is there enough data to calculate the mean
        if gradient_calulation_type == 'mean':
            if (end_level - start_level) < 0.2 or (th_end_idx - th_start_idx) < 2:
                # force calculation type to all
                gradient_calulation_type = 'all'
                print('unable to calculate attack gradient with the \'mean\' method, reverting to \'all\' method.')

        if gradient_calulation_type == 'mean':
            # calculate the gradient based on the weighted mean of each attack
            threshold_step = dyn_range / (thresh_no + 2)

            gradient_thresh_array = np.arange(start_level, end_level + (threshold_step * dyn_range),
                                              (threshold_step * dyn_range))
            cross_threshold_times = np.zeros(len(gradient_thresh_array))
            cross_threshold_values = np.zeros(len(gradient_thresh_array))
            gradient_envelope_segment = envelope_samples[th_start_idx:th_end_idx + 1]

            for i in range(len(cross_threshold_values)):
                # NOTE(review): np.argmax returns a scalar here, so hold[0] looks like it
                # would raise — the 'mean' path appears broken; confirm before relying on it.
                hold = np.argmax(gradient_envelope_segment >= gradient_thresh_array[i])
                cross_threshold_times[i] = hold[0] / float(fs)
                cross_threshold_values[i] = gradient_envelope_segment[hold[0]]

            # per-interval gradients between adjacent threshold crossings
            pente_v = np.diff(cross_threshold_values) / np.diff(cross_threshold_times)

            # calculate weighted average of all gradients with a gaussian distribution
            m_threshold = 0.5 * (gradient_thresh_array[:-1] + gradient_thresh_array[1:])
            weight_v = np.exp(-(m_threshold - 0.5) ** 2 / (0.5 ** 2))

            attack_gradient = np.sum(pente_v * weight_v) / np.sum(weight_v)

        elif gradient_calulation_type == 'all':
            # calculate the attack gradient from th_start_idx to th_end_idx
            attack_gradient = (end_level - start_level) / attack_time

        '''
        More stuff to return if we want extra information to be displayed
        '''
        thresholds_to_return = [calculation_type, th_start_idx + min_pre_peak_idx, th_end_idx + min_pre_peak_idx,
                                threshold_idxs + min_pre_peak_idx]

    elif calculation_type == 'fixed_threshold':
        # set threshold values for fixed threshold method (percent of dynamic range)
        fixed_threshold_start = 20
        fixed_threshold_end = 90

        # get dynamic range
        dyn_range = max(envelope_samples) - min(envelope_samples)

        # get thresholds relative to envelope level
        lower_threshold = (fixed_threshold_start * dyn_range * 0.01) + min(envelope_samples)
        upper_threshold = (fixed_threshold_end * dyn_range * 0.01) + min(envelope_samples)

        # calculate start index
        th_start_idx = np.argmax(envelope_samples >= lower_threshold)
        # th_start_idx = th_start_idx[0]

        # find the end idx after the start idx
        th_end_idx = np.argmax(envelope_samples[th_start_idx:] >= upper_threshold)
        th_end_idx = th_end_idx + th_start_idx

        if th_start_idx == th_end_idx:
            attack_time = 1.0 / fs
        else:
            attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        # compare attack time to maximum permissible attack time
        if max_attack_time > 0:
            if attack_time > max_attack_time:
                # how many samples is equivalent to the maximum?
                max_attack_time_sample = int(fs * max_attack_time)  # convert to integer
                th_end_idx = th_start_idx + max_attack_time_sample
                attack_time = (th_end_idx - th_start_idx + 1.0) / fs

        # calculate the gradient

        # find the level of the first sample used
        start_level = envelope_samples[int(th_start_idx)]
        # find the level of the last sample used
        end_level = envelope_samples[int(th_end_idx)]

        # specify exceptions for a step functions crossing both thresholds
        if start_level == end_level:
            if th_start_idx > 0:
                # if a previous sample is available, take the previous starting sample
                start_level = envelope_samples[int(th_start_idx) - 1]
            else:
                # set start level to zero if onset is at the first sample (indicating a step function at time zero)
                start_level = 0.0

        attack_gradient = (end_level - start_level) / attack_time

        '''
        More details to be returned if desired
        '''
        thresholds_to_return = [calculation_type, th_start_idx + min_pre_peak_idx, th_end_idx + min_pre_peak_idx]

    else:
        raise ValueError('calculation_type must be set to either \'fixed_threshold\' or \'min_effort\'.')

    # convert attack time to logarithmic scale
    attack_time = np.log10(attack_time)

    # revert attack gradient metric if envelope has been normalised
    if normalise:
        attack_gradient *= normalise_factor

    '''
    Calculate the temporal centroid
    '''
    hold_env = envelope_samples[int(th_start_idx):int(th_end_idx) + 1]
    t = np.arange(0, len(hold_env)) / float(fs)
    temp_centroid = np.sum(t * hold_env) / np.sum(hold_env)
    # normalise the centroid by the segment length
    temp_centroid /= float(len(hold_env))

    if return_descriptive_data:
        return attack_time, attack_gradient, int(th_start_idx + min_pre_peak_idx), temp_centroid, thresholds_to_return
    else:
        return attack_time, attack_gradient, int(th_start_idx + min_pre_peak_idx), temp_centroid
609
+
610
+
611
def calculate_onsets(audio_samples, envelope_samples, fs, look_back_time=20, hysteresis_time=300, hysteresis_percent=10,
                     onset_in_noise_threshold=10, minimum_onset_time_separation=100, nperseg=512):
    """
    Calculates the onset times using a look backwards recursive function to identify actual note onsets, and weights
    the outputs based on the onset strength to avoid misidentifying onsets.

    Required inputs
    :param audio_samples: the audio file in the time domain.
    :param envelope_samples: the envelope of the audio file, suggested to be calculated with
                             sample_and_hold_envelope_calculation.
    :param fs: samplerate of the audio file. Function assumes the same sample rate for
               both audio_samples and envelope_samples.

    Optional inputs
    :param look_back_time: time in ms to recursively look backwards to identify start of onset,
                           defaults to 20ms.
    :param hysteresis_time: time in ms to look backwards in time for a hysteresis check,
                            set to 300ms by default.
    :param hysteresis_percent: set the percentage of dynamic range that must be checked when looking
                               backwards via hysteresis, default to 10%.
    :param onset_in_noise_threshold: set a threshold of dynamic range for determining if an onset was variation
                                     in noise or an actual onset, default to 10%.
    :param minimum_onset_time_separation: set the minimum time in ms that two onsets can be separated by.
    :param nperseg: value used in return_loop, and the assumed librosa analysis hop.

    :return: thresholded onsets, returns [0] if no onsets are identified. Note that a
             value of [0] is also possible during normal operation.
    """
    # get onsets with librosa estimation (backtrack=True moves each onset to a preceding minimum)
    onsets = librosa.onset.onset_detect(y=audio_samples, sr=fs, backtrack=True, units='samples')

    # set values for return_loop method
    time_thresh = int(look_back_time * 0.001 * fs)  # look-back time converted to samples
    hysteresis_samples = int(hysteresis_time * fs * 0.001)  # hysteresis time, in samples
    envelope_dyn_range = max(envelope_samples) - min(envelope_samples)
    hysteresis_thresh = envelope_dyn_range * hysteresis_percent * 0.01

    # only conduct analysis if there are onsets detected
    if np.size(onsets):
        # empty array for storing exact onset idxs
        corrected_onsets = []

        for onset_idx in onsets:
            # if the onset is 1 or 0, it's too close to the start to be corrected (1 is here due to zero padding)
            if onset_idx > 0:
                # actual onset location in samples (librosa uses 512 window size by default)
                onset_loc = np.array(onset_idx).astype('int')

                # only calculate if the onset is NOT at the end of the file, whilst other onsets exist.
                # If the only onset is at the end, calculate anyway.
                if not corrected_onsets:
                    onset_hold = return_loop(onset_loc, envelope_samples, time_thresh, hysteresis_thresh,
                                             hysteresis_samples, nperseg=nperseg)
                    corrected_onsets.append(onset_hold)
                else:
                    if (onset_loc + 511) < len(envelope_samples):
                        onset_hold = return_loop(onset_loc, envelope_samples, time_thresh, hysteresis_thresh,
                                                 hysteresis_samples, nperseg=nperseg)
                        corrected_onsets.append(onset_hold)
                    else:
                        corrected_onsets.append(0)

        # zero is returned from return_loop if no valid onset identified
        # remove zeros (except the first)
        zero_loc = np.where(np.array(corrected_onsets) == 0)[0]
        # ignore if the first value is zero
        if list(zero_loc):
            if zero_loc[0] == 0:
                zero_loc = zero_loc[1:]
        corrected_onsets = np.delete(corrected_onsets, zero_loc)

        # remove duplicates while preserving order
        hold_onsets = []
        for i in corrected_onsets:
            if i not in hold_onsets:
                hold_onsets.append(i)
        corrected_onsets = hold_onsets

        '''
        Remove repeated onsets and compare onset segments against the dynamic range
        to remove erroneous onsets in noise. If the onset segment (samples between
        adjacent onsets) has a dynamic range less than 10% of total dynamic range,
        remove this onset.
        '''
        if len(corrected_onsets) > 1:
            thd_corrected_onsets = []
            last_value = corrected_onsets[-1]
            threshold = onset_in_noise_threshold * envelope_dyn_range * 0.01

            # iterate in reverse so deletions don't disturb indices still to be visited
            for i in reversed(range(len(corrected_onsets))):
                if corrected_onsets[i] == corrected_onsets[-1]:
                    segment = envelope_samples[corrected_onsets[i]:]
                else:
                    segment = envelope_samples[corrected_onsets[i]:corrected_onsets[i + 1]]

                # only conduct if the segment is greater than 1 sample long
                if len(segment) > 1:
                    # find attack portion SNR
                    peak_idx = np.argmax(segment)
                    if peak_idx > 0:
                        # get the dynamic range of the attack portion
                        seg_dyn_range = max(segment) - min(segment[:peak_idx])
                        if seg_dyn_range >= threshold:
                            pass
                        else:
                            corrected_onsets = np.delete(corrected_onsets, i)
                    else:
                        corrected_onsets = np.delete(corrected_onsets, i)
                else:
                    corrected_onsets = np.delete(corrected_onsets, i)

        # remove onsets that are too close together, favouring the earlier onset
        if len(corrected_onsets) > 1:
            minimum_onset_time_separation_samples = fs * 0.001 * minimum_onset_time_separation
            time_separation = np.diff(corrected_onsets)
            # while loop for potential multiple iterations
            while len(corrected_onsets) > 1 and min(time_separation) < minimum_onset_time_separation_samples:
                onsets_to_remove = []
                # some onsets are closer together than the minimum value
                for i in range(len(corrected_onsets)-1):
                    # are these two adjacent onsets too close?
                    if abs(corrected_onsets[i+1] - corrected_onsets[i]) < minimum_onset_time_separation_samples:
                        onsets_to_remove.append(i+1)

                # remove onsets too close together
                corrected_onsets = np.delete(corrected_onsets, onsets_to_remove)
                time_separation = np.diff(corrected_onsets)

        '''
        Correct onsets by comparing to the onset strength.

        If there is an onset strength of 3 or greater between two onsets, then the onset is valid.
        Otherwise, discard the onset.
        '''
        thd_corrected_onsets = []

        # get the onset strength
        onset_strength = librosa.onset.onset_strength(y=audio_samples, sr=fs)

        # map sample-domain onsets onto librosa's frame domain (hop of 512)
        strength_onset_times = np.array(np.array(corrected_onsets) / 512).astype('int')
        # NOTE(review): ndarray.clip is not in-place — this call has no effect; confirm intent.
        strength_onset_times.clip(min=0)

        corrected_original_onsets = []
        corrected_strength_onsets = []
        for onset_idx in reversed(range(len(corrected_onsets))):
            current_strength_onset = strength_onset_times[onset_idx]
            if current_strength_onset == strength_onset_times[-1]:
                onset_strength_seg = onset_strength[current_strength_onset:]
            else:
                onset_strength_seg = onset_strength[current_strength_onset:strength_onset_times[onset_idx + 1]]

            if max(onset_strength_seg) < 3:
                strength_onset_times = np.delete(strength_onset_times, onset_idx)
            else:
                thd_corrected_onsets.append(corrected_onsets[onset_idx])

    else:
        # no onsets detected by librosa
        return [0]

    # onsets were accumulated in reverse order above
    thd_corrected_onsets.sort()
    if thd_corrected_onsets:
        return thd_corrected_onsets
    else:
        return [0]
777
+
778
+
779
def get_bandwidth_array(audio_samples, fs, nperseg=512, overlap_step=32, rolloff_thresh=0.01,
                        rollon_thresh_percent=0.05, log_bandwidth=False, return_centroid=False,
                        low_bandwidth_method='Percentile', normalisation_method='RMS_Time_Window'):
    """
    Calculate the bandwidth array estimate for an audio signal.

    Required inputs
    :param audio_samples: array of the audio samples.
    :param fs: samplerate of the audio samples.

    Optional inputs
    :param nperseg: number of samples used for calculating the spectrogram.
    :param overlap_step: number of samples the spectrogram window advances per frame.
    :param rolloff_thresh: magnitude threshold for the rolloff frequency (and the rollon
                           frequency with the 'Cutoff' method).
    :param rollon_thresh_percent: proportion of frame power used for the 'Percentile' rollon.
    :param log_bandwidth: return the natural log of the rolloff/rollon ratio rather than
                          the frequency difference, defaults to False.
    :param return_centroid: additionally return the power-weighted mean spectral centroid.
    :param low_bandwidth_method: method for the low-frequency limit of the bandwidth,
                                 'Percentile' (default) or 'Cutoff'.
    :param normalisation_method: spectrogram normalisation, 'Single_TF_Bin',
                                 'RMS_Time_Window' (default), or 'none'.

    :return: (bandwidth array, spectrogram time array, spectrogram frequency array),
             plus the weighted centroid when return_centroid is True.

    :raises ValueError: on an unrecognised normalisation_method or low_bandwidth_method.
    """
    noverlap = nperseg - overlap_step
    # magnitude spectrogram with a rectangular window
    f, t, spec = spectrogram(audio_samples, fs, window='boxcar', nperseg=nperseg, noverlap=noverlap, scaling='density',
                             mode='magnitude')

    # normalise the spectrogram
    if normalisation_method == 'Single_TF_Bin':
        spec /= np.max(spec)
    elif normalisation_method == 'RMS_Time_Window':
        spec /= np.max(np.sqrt(np.sum(spec * spec, axis=0)))
    elif normalisation_method == "none":
        pass
    else:
        raise ValueError('Bandwidth normalisation method must be \'Single_TF_Bin\' or \'RMS_Time_Window\'')

    # frames quieter than 10% of the frame-power dynamic range are skipped entirely
    level_with_time = np.sum(spec, axis=0)
    max_l = np.max(level_with_time)
    min_l = np.min(level_with_time)
    min_tpower = (0.1 * (max_l - min_l)) + min_l

    # initialise lists for storage
    rollon = []
    rolloff = []
    bandwidth = []
    centroid = []
    centroid_power = []

    # calculate the bandwidth curve frame by frame
    for time_count in range(len(t)):
        seg = spec[:, time_count]
        tpower = np.sum(seg)
        if tpower > min_tpower:
            # BUG FIX: the rollon frequency is now tracked explicitly. Previously the
            # bandwidth was computed from `rollon_counter`, which is only assigned by the
            # 'Percentile' branch, so low_bandwidth_method='Cutoff' raised a NameError.
            rollon_freq = None
            if low_bandwidth_method == 'Percentile':
                # lowest frequency below which rollon_thresh_percent of the frame power lies
                rollon_counter = 1
                cumulative_power = np.sum(seg[:rollon_counter])
                rollon_thresh = tpower * rollon_thresh_percent

                while cumulative_power < rollon_thresh:
                    rollon_counter += 1
                    cumulative_power = np.sum(seg[:rollon_counter])
                rollon_freq = f[rollon_counter - 1]
                rollon.append(rollon_freq)
            elif low_bandwidth_method == 'Cutoff':
                # lowest frequency bin whose magnitude exceeds the cutoff threshold
                rollon_idx = np.where(seg >= rolloff_thresh)[0]
                if len(rollon_idx):
                    rollon_freq = f[rollon_idx[0]]
                    rollon.append(rollon_freq)
            else:
                raise ValueError('low_bandwidth_method must be \'Percentile\' or \'Cutoff\'')

            # get the spectral rolloff (highest bin exceeding the threshold)
            rolloff_idx = np.where(seg >= rolloff_thresh)[0]
            if len(rolloff_idx) and rollon_freq is not None:
                rolloff_idx = rolloff_idx[-1]
                rolloff.append(f[rolloff_idx])
                if log_bandwidth:
                    bandwidth.append(np.log(f[rolloff_idx] / float(rollon_freq)))
                else:
                    bandwidth.append(f[rolloff_idx] - rollon_freq)
            else:
                bandwidth.append(0)

            # get centroid values
            centroid.append(np.sum(seg * f) / np.sum(seg))
            centroid_power.append(tpower)
        else:
            bandwidth.append(0)

    if return_centroid:
        return bandwidth, t, f, np.average(centroid, weights=centroid_power)
    else:
        return bandwidth, t, f
877
+
878
+
879
def calculate_bandwidth_gradient(bandwidth_segment, t):
    """
    Calculate the gradient from the bandwidth array.

    :param bandwidth_segment: segment of bandwidth values for calculation.
    :param t: time base corresponding to the bandwidth values.

    :return: gradient of the bandwidth rise, or False when it cannot be computed
             (empty segment, or the maximum occurs at the first index).
    """
    # nothing to measure for an empty segment
    if not bandwidth_segment:
        return False

    peak_idx = np.argmax(bandwidth_segment)
    # a peak at index 0 leaves no rising portion to measure
    if peak_idx <= 0:
        return False

    # last occurrence of the pre-peak minimum
    pre_peak = np.array(bandwidth_segment[:peak_idx])
    trough_idx = np.where(pre_peak == pre_peak.min())[0][-1]

    rise = bandwidth_segment[peak_idx] - bandwidth_segment[trough_idx]
    duration = (peak_idx - trough_idx) * (t[1] - t[0])
    return rise / duration
902
+
903
+
904
def calculate_rms_enveope(audio_samples, step_size=256, overlap_step=256, normalise=True):
    """
    Calculate the RMS envelope of the audio signal.

    :param audio_samples: numpy array, the audio samples.
    :param step_size: int, number of samples per RMS frame.
    :param overlap_step: int, number of samples to advance between frames.
    :param normalise: normalise the envelope to its peak value, defaults to True.

    :return: RMS envelope as a numpy array.
    """
    rms_values = []
    start = 0

    # full frames across the signal
    while start < len(audio_samples) - step_size:
        frame = audio_samples[start:start + step_size]
        rms_values.append(np.sqrt(np.mean(frame * frame)))
        start += overlap_step

    # one final frame from whatever remains
    tail = audio_samples[start:]
    rms_values.append(np.sqrt(np.mean(tail * tail)))

    envelope = np.array(rms_values)

    # scale so the peak magnitude is 1
    if normalise:
        envelope = envelope * (1.0 / max(abs(envelope)))

    return envelope
933
+
934
+
935
def detect_peaks(array, freq=0, cthr=0.2, unprocessed_array=False, fs=44100):
    """
    Detect the peaks in an array, based on the mirpeaks algorithm.

    Peaks are local maxima separated from their neighbours by a
    "contrastive notch" of at least `cthr`; candidate peaks without such a
    notch are merged into the stronger neighbour.  Peak positions are then
    refined by parabolic interpolation on the unprocessed array.

    :param array: array in which to detect peaks (normalised scale expected).
    :param freq: scale representing the x axis; if scalar, a linear
        frequency scale 0..fs/2 over len(array) is assumed.
    :param cthr: contrast threshold for accepting adjacent peaks.
    :param unprocessed_array: unnormalised version of `array` used to read
        peak values; if scalar/False, defaults to `array` itself.
    :param fs: sample rate used to build the default frequency scale.

    :return: (peak_idx, peak_value, peak_x) - indices of peaks in `array`,
        peak values read from the unprocessed array, and peak positions on
        the `freq` scale (interpolated where possible).  All three are
        empty lists when no peaks are found.
    """
    # flatten the array for correct processing
    array = array.flatten()

    if np.isscalar(freq):
        # calculate the frequency scale - assuming a samplerate if none provided
        freq = np.linspace(0, fs/2.0, len(array))

    if np.isscalar(unprocessed_array):
        unprocessed_array = array

    # add values to allow peaks at the first and last values
    array_appended = np.insert(array, [0, len(array)], -2.0)  # to allow peaks at start and end (default of mir)
    # unprocessed array to get peak values
    array_unprocess_appended = np.insert(unprocessed_array, [0, len(unprocessed_array)], -2.0)
    # append the frequency scale for precise freq calculation
    freq_appended = np.insert(freq, [0, len(freq)], -1.0)

    # get the difference values
    diff_array = np.diff(array_appended)

    # find local maxima: above threshold, rising before and not rising after
    # (+1 converts to indices into the appended arrays)
    mx = np.array(np.where((array >= cthr) & (diff_array[0:-1] > 0) & (diff_array[1:] <= 0))) + 1

    # initialise arrays for output
    finalmx = []
    peak_value = []
    peak_x = []
    peak_idx = []

    if np.size(mx) > 0:
        # unpack the array if peaks found
        mx = mx[0]

        j = 0  # scans the peaks from beginning to end
        mxj = mx[j]  # the current peak under evaluation
        jj = j + 1
        bufmin = 2.0  # running minimum between candidate peaks
        bufmax = array_appended[mxj]  # level of the current candidate peak

        if mxj > 1:
            oldbufmin = min(array_appended[:mxj-1])
        else:
            oldbufmin = array_appended[0]

        while jj < len(mx):
            # if adjacent mx values are too close, returns no array
            if mx[jj-1]+1 == mx[jj]-1:
                bufmin = min([bufmin, array_appended[mx[jj-1]]])
            else:
                bufmin = min([bufmin, min(array_appended[mx[jj-1]:mx[jj]-1])])

            if bufmax - bufmin < cthr:
                # There is no contrastive notch
                if array_appended[mx[jj]] > bufmax:
                    # new peak is significantly higher than the old peak,
                    # the peak is transferred to the new position
                    j = jj
                    mxj = mx[j]  # the current peak
                    bufmax = array_appended[mxj]
                    oldbufmin = min([oldbufmin, bufmin])
                    bufmin = 2.0
                elif array_appended[mx[jj]] - bufmax <= 0:
                    bufmax = max([bufmax, array_appended[mx[jj]]])
                    oldbufmin = min([oldbufmin, bufmin])

            else:
                # There is a contrastive notch
                if bufmax - oldbufmin < cthr:
                    # But the previous peak candidate is too weak and therefore discarded
                    oldbufmin = min([oldbufmin, bufmin])
                else:
                    # The previous peak candidate is OK and therefore stored
                    finalmx.append(mxj)
                    oldbufmin = bufmin

                bufmax = array_appended[mx[jj]]
                j = jj
                mxj = mx[j]  # The current peak
                bufmin = 2.0

            jj += 1
        if bufmax - oldbufmin >= cthr and (bufmax - min(array_appended[mx[j] + 1:]) >= cthr):
            # The last peak candidate is OK and stored
            finalmx.append(mx[j])

        ''' Sort the values according to their level '''
        finalmx = np.array(finalmx)
        sort_idx = np.argsort(array_appended[finalmx])[::-1]  # descending sort
        finalmx = finalmx[sort_idx]

        peak_idx = finalmx - 1  # indexes were for the appended array, -1 to return to original array index
        peak_value = array_unprocess_appended[finalmx]
        peak_x = freq_appended[finalmx]

        ''' Interpolation for more precise peak location '''
        corrected_value = []
        corrected_position = []
        for current_peak_idx in finalmx:
            # if there is enough space to do the fitting
            if 1 < current_peak_idx < (len(array_unprocess_appended) - 2):
                # standard three-point parabolic interpolation around the peak
                y0 = array_unprocess_appended[current_peak_idx]
                ym = array_unprocess_appended[current_peak_idx-1]
                yp = array_unprocess_appended[current_peak_idx+1]
                p = (yp - ym) / (2 * (2*y0 - yp - ym))
                corrected_value.append(y0 - (0.25*(ym-yp)*p))
                if p >= 0:
                    correct_pos = ((1 - p) * freq_appended[current_peak_idx]) + (p * freq_appended[current_peak_idx+1])
                    corrected_position.append(correct_pos)
                elif p < 0:
                    correct_pos = ((1 + p) * freq_appended[current_peak_idx]) - (p * freq_appended[current_peak_idx-1])
                    corrected_position.append(correct_pos)
            else:
                # too close to the array edge: keep the uninterpolated value
                corrected_value.append(array_unprocess_appended[current_peak_idx])
                corrected_position.append(freq_appended[current_peak_idx])

        if corrected_position:
            peak_x = corrected_position
            peak_value = corrected_value

    return peak_idx, peak_value, peak_x
1067
+
1068
+
1069
def sigmoid(x, offset=0.2, n=10):
    """Sigmoid-like weighting curve: x**n / (x**n + offset)."""
    powered = x ** n
    return powered / (powered + offset)
1072
+
1073
+
1074
def channel_reduction(audio_samples, phase_correction=False):
    """
    Reduce a multichannel audio array to a single (mono) channel.

    :param audio_samples: numpy array of samples, 1-D or (samples, channels).
    :param phase_correction: when True, stereo channels are checked for
        strong negative correlation before summing; an out-of-phase pair
        is reduced by keeping the left channel only.

    :return: mono audio samples.
    """
    shape = np.shape(audio_samples)
    if len(shape) <= 1:
        # already mono
        return audio_samples

    channels = shape[1]
    if channels == 2:
        if phase_correction:
            # crudely check for out-of-phase signals via Pearson correlation
            r, pval = scipy.stats.pearsonr(audio_samples[:, 0], audio_samples[:, 1])
            if r < -0.5:
                return audio_samples[:, 0]
            return np.sum(audio_samples, axis=1)
        return np.sum(audio_samples, axis=1)

    if channels > 2:
        # multichannel layouts cannot be decoded reliably here, so crudely
        # sum the first three channels
        # TODO Update to include multichannel variants and decode according
        # to: http://www.atsc.org/wp-content/uploads/2015/03/A52-201212-17.pdf
        return np.sum(audio_samples[:, 0:3], axis=1)

    return audio_samples
1108
+
1109
+
1110
def spectral_flux(spectrogram, method='sum'):
    """
    Compute the spectral flux: the variation between successive
    spectrogram time frames.

    :param spectrogram: 2-D numpy array, shape (frequency bins, time frames).
    :param method: 'sum' for the normalised Euclidean difference between
        adjacent frames, 'multiply' for the normalised cross-product
        measure.  Any other value returns None.

    :return: 1-D numpy array of flux values (one per frame transition),
        or None for an unrecognised method.
    """
    if method == 'sum':
        # Euclidean distance between adjacent frames, normalised by the
        # number of frequency bins
        diff_spec = np.diff(spectrogram, axis=1)
        return np.sqrt(np.sum(diff_spec ** 2, axis=0)) / float(diff_spec.shape[0])

    elif method == 'multiply':
        # cross-product between adjacent frames, normalised by the product
        # of the frames' individual energies
        # (a duplicate `denom` computation of the same product was dead
        # code and has been removed)
        diff_spec = spectrogram[:, :-1] * spectrogram[:, 1:]
        sum_diff_spec = np.sum(diff_spec ** 2.0, axis=0)  # variation across time
        orig_spec_var = np.sum(spectrogram[:, :-1] ** 2.0, axis=0)
        delayed_spec_var = np.sum(spectrogram[:, 1:] ** 2.0, axis=0)
        return np.nan_to_num(1 - sum_diff_spec / (orig_spec_var * delayed_spec_var))
1135
+
1136
+
1137
def log_sum(array):
    """
    Sum an array of decibel values in the linear power domain.

    :param array: numpy array of dB values.
    :return: the dB level of the summed powers.
    """
    linear_powers = 10 ** (array / 10.0)
    return 10 * np.log10(np.sum(linear_powers))
1147
+
1148
+
1149
def filter_design2(Fc, fs, N):
    """
    Design a Butterworth Nth-order one-third-octave bandpass filter.

    :param Fc: centre frequency in Hz.
    :param fs: sample rate in Hz.
    :param N: filter order.
    :return: (b, a) filter coefficients.
    """
    nyquist = fs / 2.0
    # third-octave band edges, normalised to Nyquist
    low_edge = (2.0 ** (-1.0 / 6)) * Fc / nyquist
    high_edge = (2.0 ** (1.0 / 6)) * Fc / nyquist

    # force the upper edge below 1.0 for cases where the highest band's
    # upper edge would exceed Nyquist
    if high_edge >= 1.0:
        high_edge = 0.9999999999

    return scipy.signal.butter(N, [low_edge, high_edge], 'bandpass')
1164
+
1165
+
1166
def midbands(Fmin, Fmax, fs):
    """
    Determine the third-octave bands within [Fmin, Fmax] that fit below
    the Nyquist frequency.

    (Dead locals from the original -- `lowest_band`, `highest_band` and
    `FUpper` -- were never used and have been removed.)

    :param Fmin: lowest third-octave band centre frequency; must be one of
        the standard labelled frequencies (25, 31.5, ..., 20000).
    :param Fmax: highest third-octave band centre frequency (same set).
    :param fs: sample rate in Hz.

    :return: (ff, F, j) - the exact base-two midband frequencies, the
        standard labelled frequencies, and the band indices relative to
        the 1 kHz reference band.
    """
    Nyquist_frequency = fs / 2.0

    fr = 1000  # reference frequency is 1000Hz
    i = np.arange(-16, 14, 1)
    lab_freq = np.array([25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600,
                         2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500, 16000, 20000])

    A = np.where(lab_freq == Fmin)[0][0]
    B = np.where(lab_freq == Fmax)[0][0]

    # drop the top bands until the highest band fits under Nyquist
    while lab_freq[B] > Nyquist_frequency:
        B -= 1

    j = i[np.arange(A, B+1, 1)]  # indices to find exact midband frequencies
    ff = (2.0 ** (j / 3.0)) * fr  # exact midband frequencies (calculated as base two exact)
    F = lab_freq[np.arange(A, B+1, 1)]
    return ff, F, j
1196
+
1197
+
1198
def filter_third_octaves_downsample(x, Pref, fs, Fmin, Fmax, N):
    """
    Filter the audio into third-octave bands and return per-band levels.

    Bands of 6300 Hz and above are filtered directly; 5000 Hz and below use
    a multirate implementation, repeatedly low-pass filtering and
    downsampling by 2 so that one octave's worth of filters (Bu/Bc/Bl) can
    be reused at each rate.

    :param x: the audio samples (input length should be a multiple of 2^8).
    :param Pref: reference level for calculating decibels - does not allow for negative values.
    :param fs: the sampling frequency.
    :param Fmin: the minimum frequency.
    :param Fmax: the maximum frequency (must be at least 2500 Hz).
    :param N: the filter order.

    :return: (Ptotal, P, F) - overall level in dB, the per-band levels in
        dB, and the labelled band centre frequencies.
    """
    # identify midband frequencies
    [ff, F, j] = midbands(Fmin, Fmax, fs)

    # apply filters
    P = np.zeros(len(j))
    k = np.where(j == 7)[0][0]  # Determines where downsampling will commence (5000 Hz and below)
    m = len(x)

    # For frequencies of 6300 Hz or higher, direct implementation of filters.
    for i in range(len(j)-1, k, -1):
        B, A = filter_design2(ff[i], fs, N)
        if i==k+3:  # Upper 1/3-oct. band in last octave.
            Bu = B;
            Au = A;
        if i == k + 2:  # Center 1/3-oct. band in last octave.
            Bc = B;
            Ac = A;
        if i == k + 1:  # Lower 1/3-oct. band in last octave.
            Bl = B;
            Al = A;
        y = scipy.signal.lfilter(B, A, x);
        if np.max(y) > 0:
            P[i] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m))  # Convert to decibels.
        else:
            # silent band: represent as -inf dB
            P[i] = -1.0 * np.inf

    # 5000 Hz or lower, multirate filter implementation.
    try:
        for i in range(k, 1, -3):  #= k:-3:1;
            # Design anti-aliasing filter (IIR Filter)
            Wn = 0.4
            C, D = scipy.signal.cheby1(2, 0.1, Wn)
            # Filter
            x = scipy.signal.lfilter(C, D, x)
            # Downsample
            idx = np.arange(1, len(x), 2)
            x = x[idx]
            fs = fs / 2.0
            m = len(x)
            # Performs the filtering (upper, centre, lower band at this rate)
            y = scipy.signal.lfilter(Bu, Au, x)
            if np.max(y) > 0:
                P[i] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0)/m))
            else:
                P[i] = -1.0 * np.inf
            y = scipy.signal.lfilter(Bc, Ac, x)
            if np.max(y) > 0:
                P[i-1] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0)/m))
            else:
                P[i-1] = -1.0 * np.inf
            y = scipy.signal.lfilter(Bl, Al, x)
            if np.max(y) > 0:
                P[i - 2] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m))
            else:
                P[i-2] = -1.0 * np.inf
    except:
        # NOTE(review): bare except silently truncates the band array on
        # ANY failure (including programming errors); consider narrowing
        # to a specific exception type -- confirm which failure this was
        # meant to catch before changing
        P = P[1:len(j)]

    # "calibrate" the readings based from Pref, chosen as 100 in most uses
    P = P + Pref

    # log transformation: sum all bands in the power domain
    Plog = 10 ** (P / 10.0)
    Ptotal = np.sum(Plog)
    if Ptotal > 0:
        Ptotal = 10 * np.log10(Ptotal)
    else:
        Ptotal = -1.0 * np.inf

    return Ptotal, P, F
1278
+
1279
+
1280
def specific_loudness(x, Pref, fs, Mod):
    """
    Calculates loudness in 3rd octave bands
    based on ISO 532 B / DIN 45631
    Source: BASIC code in J Acoust Soc Jpn(E) 12, 1(1991)

    :param x: signal.
    :param Pref: reference value [dB].
    :param fs: sampling frequency [Hz].
    :param Mod: 0 for free field, 1 for diffuse field.

    :return: (N_entire, N_single) - entire loudness [sone] and partial
        (specific) loudness [sone/Bark] over 240 Bark-scale bins.

    Original Matlab code by Claire Churchill Jun. 2004
    Transcoded by Andy Pearce 2018
    """

    # 'Generally used third-octave band filters show a leakage towards neighbouring filters of about -20 dB. This
    # means that a 70dB, 1 - kHz tone produces the following levels at different centre
    # frequencies: 10dB at 500Hz, 30dB at 630Hz, 50dB at 800Hz and 70dB at 1kHz.
    # P211 Psychoacoustics: Facts and Models, E.Zwicker and H.Fastl
    # (A filter order of 4 gives approx this result)

    # set defaults for the third-octave analysis
    Fmin = 25
    Fmax = 12500
    order = 4
    # filter the audio into third-octave band levels
    Ptotal, P, F = filter_third_octaves_downsample(x, Pref, fs, Fmin, Fmax, order);


    # set more defaults for perceptual filters

    # Centre frequencies of 1 / 3 Oct bands(FR)
    FR = np.array([25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600,
                   2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500])

    # Ranges of 1 / 3 Oct bands for correction at low frequencies according to equal loudness contours
    RAP = np.array([45, 55, 65, 71, 80, 90, 100, 120])

    # Reduction of 1/3 Oct Band levels at low frequencies according to equal loudness contours
    # within the eight ranges defined by RAP(DLL)
    DLL = np.array([[-32, -24, -16, -10, -5, 0, -7, -3, 0, -2, 0],
                    [-29, -22, -15, -10, -4, 0, -7, -2, 0, -2, 0],
                    [-27, -19, -14, -9, -4, 0, -6, -2, 0, -2, 0],
                    [-25, -17, -12, -9, -3, 0, -5, -2, 0, -2, 0],
                    [-23, -16, -11, -7, -3, 0, -4, -1, 0, -1, 0],
                    [-20, -14, -10, -6, -3, 0, -4, -1, 0, -1, 0],
                    [-18, -12, -9, -6, -2, 0, -3, -1, 0, -1, 0],
                    [-15, -10, -8, -4, -2, 0, -3, -1, 0, -1, 0]])

    # Critical band level at absolute threshold without taking into account the
    # transmission characteristics of the ear
    LTQ = np.array([30, 18, 12, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])  # Threshold due to internal noise
    # Hearing thresholds for the excitation levels (each number corresponds to a critical band 12.5kHz is not included)

    # Attenuation representing transmission between freefield and our hearing system
    A0 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5, -1.6, -3.2, -5.4, -5.6, -4, -1.5, 2, 5, 12])
    # Attenuation due to transmission in the middle ear
    # Moore et al disagrees with this being flat for low frequencies

    # Level correction to convert from a free field to a diffuse field(last critical band 12.5 kHz is not included)
    DDF = np.array([0, 0, 0.5, 0.9, 1.2, 1.6, 2.3, 2.8, 3, 2, 0, -1.4, -2, -1.9, -1, 0.5, 3, 4, 4.3, 4])

    # Correction factor because using third octave band levels(rather than critical bands)
    DCB = np.array([-0.25, -0.6, -0.8, -0.8, -0.5, 0, 0.5, 1.1, 1.5, 1.7, 1.8, 1.8, 1.7, 1.6, 1.4, 1.2, 0.8,
                    0.5, 0, -0.5])

    # Upper limits of the approximated critical bands
    ZUP = np.array([0.9, 1.8, 2.8, 3.5, 4.4, 5.4, 6.6, 7.9, 9.2, 10.6, 12.3, 13.8, 15.2, 16.7, 18.1, 19.3, 20.6, 21.8,
                    22.7, 23.6, 24])

    # Range of specific loudness for the determination of the steepness of the upper slopes in the specific loudness
    # - critical band rate pattern(used to plot the correct USL curve)
    RNS = np.array([21.5, 18, 15.1, 11.5, 9, 6.1, 4.4, 3.1, 2.13, 1.36, 0.82, 0.42, 0.30, 0.22, 0.15, 0.10, 0.035, 0])

    # This is used to design the right hand slope of the loudness
    USL = np.array([[13.0, 8.2, 6.3, 5.5, 5.5, 5.5, 5.5, 5.5],
                    [9.0, 7.5, 6.0, 5.1, 4.5, 4.5, 4.5, 4.5],
                    [7.8, 6.7, 5.6, 4.9, 4.4, 3.9, 3.9, 3.9],
                    [6.2, 5.4, 4.6, 4.0, 3.5, 3.2, 3.2, 3.2],
                    [4.5, 3.8, 3.6, 3.2, 2.9, 2.7, 2.7, 2.7],
                    [3.7, 3.0, 2.8, 2.35, 2.2, 2.2, 2.2, 2.2],
                    [2.9, 2.3, 2.1, 1.9, 1.8, 1.7, 1.7, 1.7],
                    [2.4, 1.7, 1.5, 1.35, 1.3, 1.3, 1.3, 1.3],
                    [1.95, 1.45, 1.3, 1.15, 1.1, 1.1, 1.1, 1.1],
                    [1.5, 1.2, 0.94, 0.86, 0.82, 0.82, 0.82, 0.82],
                    [0.72, 0.67, 0.64, 0.63, 0.62, 0.62, 0.62, 0.62],
                    [0.59, 0.53, 0.51, 0.50, 0.42, 0.42, 0.42, 0.42],
                    [0.40, 0.33, 0.26, 0.24, 0.24, 0.22, 0.22, 0.22],
                    [0.27, 0.21, 0.20, 0.18, 0.17, 0.17, 0.17, 0.17],
                    [0.16, 0.15, 0.14, 0.12, 0.11, 0.11, 0.11, 0.11],
                    [0.12, 0.11, 0.10, 0.08, 0.08, 0.08, 0.08, 0.08],
                    [0.09, 0.08, 0.07, 0.06, 0.06, 0.06, 0.06, 0.05],
                    [0.06, 0.05, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02]])

    # apply weighting factors (low-frequency equal-loudness correction)
    Xp = np.zeros(11)
    Ti = np.zeros(11)
    for i in range(11):
        j = 0
        while (P[i] > (RAP[j] - DLL[j, i])) & (j < 7):
            j += 1
        Xp[i] = P[i] + DLL[j, i]
        Ti[i] = 10.0 ** (Xp[i] / 10.0)

    # Intensity values in first three critical bands calculated
    Gi = np.zeros(3)
    Gi[0] = np.sum(Ti[0:6])  # Gi(1) is the first critical band (sum of two octaves(25Hz to 80Hz))
    Gi[1] = np.sum(Ti[6:9])  # Gi(2) is the second critical band (sum of octave(100Hz to 160Hz))
    Gi[2] = np.sum(Ti[9:11])  # Gi(3) is the third critical band (sum of two third octave bands(200Hz to 250Hz))

    if np.max(Gi) > 0.0:
        FNGi = 10 * np.log10(Gi)
    else:
        FNGi = -1.0 * np.inf
    LCB = np.zeros_like(Gi)
    for i in range(3):
        if Gi[i] > 0:
            LCB[i] = FNGi[i]
        else:
            LCB[i] = 0

    # Calculate the main loudness in each critical band
    Le = np.ones(20)
    Lk = np.ones_like(Le)
    Nm = np.ones(21)
    for i in range(20):
        Le[i] = P[i+8]
        if i <= 2:
            Le[i] = LCB[i]
        Lk[i] = Le[i] - A0[i]
        Nm[i] = 0
        if Mod == 1:
            # diffuse-field correction
            Le[i] = Le[i] + DDF[i]
        if Le[i] > LTQ[i]:
            Le[i] = Lk[i] - DCB[i]
            # Zwicker loudness transformation constants
            S = 0.25
            MP1 = 0.0635 * 10.0 ** (0.025 * LTQ[i])
            MP2 = (1 - S + S * 10 ** (0.1 * (Le[i] - LTQ[i]))) ** 0.25 - 1
            Nm[i] = MP1 * MP2;
            if Nm[i] <= 0:
                Nm[i] = 0
    Nm[20] = 0

    # correction of the lowest band's specific loudness
    KORRY = 0.4 + 0.32 * Nm[0] ** 0.2
    if KORRY > 1:
        KORRY = 1

    Nm[0] = Nm[0] * KORRY

    # Add masking curves to the main loudness in each third octave band
    N = 0
    z1 = 0  # critical band rate starts at 0
    n1 = 0  # loudness level starts at 0
    j = 17
    iz = 0
    z = 0.1
    ns = []

    for i in range(21):
        # Determines where to start on the slope
        ig = i-1
        if ig > 7:
            ig = 7
        control = 1
        while (z1 < ZUP[i]) | (control == 1):  # ZUP is the upper limit of the approximated critical band
            # Determines which of the slopes to use
            if n1 < Nm[i]:  # Nm is the main loudness level
                j = 0
                while RNS[j] > Nm[i]:  # the value of j is used below to build a slope
                    j += 1  # j becomes the index at which Nm(i) is first greater than RNS

            # The flat portions of the loudness graph
            if n1 <= Nm[i]:
                z2 = ZUP[i]  # z2 becomes the upper limit of the critical band
                n2 = Nm[i]
                N = N + n2 * (z2 - z1)  # Sums the output(N_entire)
                for k in np.arange(z, z2+0.01, 0.1):
                    if not ns:
                        ns.append(n2)
                    else:
                        if iz == len(ns):
                            ns.append(n2)
                        elif iz < len(ns):
                            ns[iz] = n2

                    if k < (z2 - 0.05):
                        iz += 1
                z = k  # z becomes the last value of k
                z = round(z * 10) * 0.1

            # The sloped portions of the loudness graph
            if n1 > Nm[i]:
                n2 = RNS[j]
                if n2 < Nm[i]:
                    n2 = Nm[i]
                dz = (n1 - n2) / USL[j, ig]  # USL = slopes
                dz = round(dz * 10) * 0.1
                if dz == 0:
                    dz = 0.1
                z2 = z1 + dz
                if z2 > ZUP[i]:
                    z2 = ZUP[i]
                    dz = z2 - z1
                    n2 = n1 - dz * USL[j, ig]  # USL = slopes
                N = N + dz * (n1 + n2) / 2.0  # Sums the output(N_entire)
                for k in np.arange(z, z2+0.01, 0.1):
                    if not ns:
                        ns.append(n1 - (k - z1) * USL[j, ig])
                    else:
                        if iz == len(ns):
                            ns.append(n1 - (k - z1) * USL[j, ig])
                        elif iz < len(ns):
                            ns[iz] = n1 - (k - z1) * USL[j, ig]
                    if k < (z2 - 0.05):
                        iz += 1
                z = k
                z = round(z * 10) * 0.1
                if n2 == RNS[j]:
                    j += 1
                if j > 17:
                    j = 17
            n1 = n2
            z1 = z2
            z1 = round(z1 * 10) * 0.1
            control += 1

    if N < 0:
        N = 0

    # quantise the entire loudness value (finer resolution below 16 sone)
    if N <= 16:
        N = np.floor(N * 1000 + 0.5) / 1000.0
    else:
        N = np.floor(N * 100 + .05) / 100.0

    # loudness level in phon
    LN = 40.0 * (N + 0.0005) ** 0.35

    if LN < 3:
        LN = 3

    if N >= 1:
        LN = 10 * np.log10(N) / np.log10(2) + 40;

    # pack the specific loudness pattern into a fixed 240-bin array
    N_single = np.zeros(240)
    for i in range(240):
        N_single[i] = ns[i]

    N_entire = N
    return N_entire, N_single
1532
+
1533
+
1534
def output_clip(score, min_score=0, max_score=100):
    """
    Clamp the output score into the range [min_score, max_score].

    :param score: value to clamp.
    :param min_score: lower bound (default 0).
    :param max_score: upper bound (default 100).

    :return: the clamped score as a float at either bound, otherwise the
        score unchanged.
    """
    # previously the hard-coded constants 0.0 / 100.0 were returned
    # regardless of the bounds passed in; now the actual bounds are honoured
    if score < min_score:
        return float(min_score)
    elif score > max_score:
        return float(max_score)
    else:
        return score
1549
+
1550
+
1551
def fast_hilbert(array, use_matlab_hilbert=False):
    """
    Calculate the Hilbert envelope of the array by segmenting the signal
    first to speed up calculation.

    The signal is processed in 32768-sample chunks with 50% hop; each
    chunk's envelope is trimmed at the overlapping edges before
    concatenation to limit edge effects.

    :param array: 1-D numpy array of samples.
    :param use_matlab_hilbert: when True use the MATLAB-style transform
        for the chunked section instead of scipy's.

    :return: numpy array, magnitude of the analytic signal (envelope).
    """
    step_size = 32768
    overlap = 2
    overlap_size = int(step_size/(2*overlap))
    # how many steps, rounded to nearest int
    # step_no = int((len(array) / (step_size - overlap)) + 0.5)
    step_start = 0
    hold_hilbert = np.array([])
    while (step_start + step_size) < len(array):
        hold_array = array[step_start:step_start+step_size]
        if use_matlab_hilbert:
            this_hilbert = np.abs(matlab_hilbert(hold_array))
        else:
            this_hilbert = np.abs(scipy.signal.hilbert(hold_array))

        if step_start == 0:
            # first chunk: keep everything up to the trailing overlap region
            hold_hilbert = np.concatenate((hold_hilbert,this_hilbert[:3*overlap_size]))
        else:
            # later chunks: drop the leading and trailing overlap regions
            hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[overlap_size:3*overlap_size]))

        # increment the step
        step_start += int(step_size/overlap)

    # do the last step on the remaining samples
    # NOTE(review): this tail always uses scipy even when
    # use_matlab_hilbert=True -- likely an oversight; confirm intent
    hold_array = array[step_start:]
    this_hilbert = np.abs(scipy.signal.hilbert(hold_array))

    # NOTE(review): when len(array) <= step_size the loop never runs and
    # this unconditionally drops the first overlap_size samples of the
    # only chunk -- verify short-input behaviour against callers
    hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[overlap_size:]))
    return hold_hilbert
1587
+
1588
+
1589
def fast_hilbert_spectrum(array, use_matlab_hilbert=False):
    """
    Calculate the spectrum of the Hilbert envelope, segmenting the signal
    first to speed up calculation.

    Long signals are processed in 32768-sample chunks with 50% hop and
    the per-chunk envelope spectra are averaged; short signals are
    zero-padded to one full chunk and processed in one pass.

    :param array: 1-D numpy array of samples.
    :param use_matlab_hilbert: when True use the MATLAB-style transform
        instead of scipy's.

    :return: numpy array, magnitude spectrum (positive-frequency half) of
        the Hilbert envelope.
    """
    step_size = 32768
    overlap = 2
    overlap_size = int(step_size/(2*overlap))  # retained for parity with fast_hilbert (unused here)
    step_start = 0
    hold_HILBERT = []
    if (step_start + step_size) < len(array):
        # long signal: chunked processing
        while (step_start + step_size) < len(array):
            hold_array = array[step_start:step_start+step_size]
            if use_matlab_hilbert:
                this_hilbert = np.abs(matlab_hilbert(hold_array))
            else:
                this_hilbert = np.abs(scipy.signal.hilbert(hold_array))

            # magnitude spectrum of the envelope, positive frequencies only
            HILBERT = np.abs(np.fft.fft(np.abs(this_hilbert)))
            HILBERT = HILBERT[0:int(len(HILBERT) / 2.0)]  # take the real part
            hold_HILBERT.append(HILBERT)

            step_start += int(step_size/overlap)

        # average the per-chunk spectra
        # hilbert_spectrum = np.sum(hold_HILBERT, axis=0)
        hilbert_spectrum = np.mean(hold_HILBERT, axis=0)

    else:
        # short signal: zero-pad up to one full chunk
        array = np.pad(array, (0, step_size - len(array)), 'constant', constant_values=0.0)

        if use_matlab_hilbert:
            this_hilbert = np.abs(matlab_hilbert(array))
        else:
            this_hilbert = np.abs(scipy.signal.hilbert(array))

        HILBERT = np.abs(np.fft.fft(np.abs(this_hilbert)))
        HILBERT = HILBERT[0:int(len(HILBERT) / 2.0)]  # take the real part

        hilbert_spectrum = HILBERT

    return hilbert_spectrum
1632
+
1633
+
1634
def matlab_hilbert(signal):
    '''
    Calculate the analytic signal (Hilbert transform) of a 1-D array using
    the frequency-domain method from MATLAB's hilbert().

    :param signal: 1-D array of real samples.
    :return: complex analytic signal, same length as the input.
    '''
    # get the fft
    n = len(signal)
    x = np.fft.fft(signal)

    # weighting vector: unity at DC (and Nyquist for even n), doubled
    # positive frequencies, zeroed negative frequencies
    h = np.zeros(n)
    if (n > 0) and (n % 2 == 0):
        # even and nonempty: a Nyquist bin exists
        # (the original tested `~isodd(n)`; bitwise NOT of 0 or 1 is -1 or
        # -2, both truthy, so odd lengths wrongly took this branch)
        h[0] = 1
        h[int(n/2)] = 1
        h[1:int(n/2)] = 2
    elif n > 0:
        # odd and nonempty: no Nyquist bin
        h[0] = 1
        h[1:int((n+1)/2.0)] = 2

    # this is the hilbert bit
    x = np.fft.ifft(x * h)

    return x
1660
+
1661
+
1662
+
1663
def isodd(num):
    """Return 1 when *num* is odd, 0 when even (bitwise parity test)."""
    return num & 1
1665
+
1666
+
1667
def window_audio(audio_samples, window_length=4096):
    """
    Segment audio samples into fixed-length, non-overlapping windows.

    The signal is zero-padded up to a whole number of windows and reshaped
    so that each row is one window of audio.

    :param audio_samples: 1-D numpy array of samples.
    :param window_length: samples per window.

    :return: 2-D numpy array, shape (num_windows, window_length).
    """
    remainder = np.mod(len(audio_samples), window_length)  # samples left after division

    # zero pad only when the last window is incomplete (previously a whole
    # extra window of silence was appended when the length divided exactly)
    if remainder:
        audio_samples = np.pad(audio_samples, (0, int(window_length - remainder)), 'constant', constant_values=0.0)

    return np.reshape(audio_samples, (int(len(audio_samples) / window_length), int(window_length)))
1682
+
1683
+
1684
def normal_dist(array, theta=1.0, mean=0.0):
    # Gaussian-shaped weighting curve over *array*.
    # NOTE(review): the exponent evaluates as -(x-mean)^2 / 2 * theta^2
    # (a multiplication), not the standard Gaussian -(x-mean)^2 / (2*theta^2).
    # Downstream callers normalise the result to [0, 1], so only the curve
    # shape matters -- confirm the intended shape before "fixing" this.
    y = (1.0 / (theta * np.sqrt(2.0 * np.pi))) * np.exp((-1.0 * ((array - mean)**2.0)) / 2.0 * (theta ** 2.0))
    return y
1687
+
1688
+
1689
def weighted_bark_level(audio_samples, fs, low_bark_band=0, upper_bark_band=240):
    """
    Measure the specific-loudness level within a weighted Bark-band region.

    :param audio_samples: 1-D numpy array of samples.
    :param fs: sample rate in Hz.
    :param low_bark_band: first Bark-scale bin of the weighting window.
    :param upper_bark_band: last Bark-scale bin of the weighting window.

    :return: (mean_weight, weighted_weight) - the plain mean of the
        weighted loudness per window, and the same values averaged with
        each window's RMS level as the weight.
    """
    # split the audio into fixed-size windows
    windowed_samples = window_audio(audio_samples)

    # build a normalised bell-shaped weighting curve emphasising the centre
    # of the requested Bark-band region
    centre_band = (low_bark_band + upper_bark_band) / 2.0
    band_range = np.arange(low_bark_band, upper_bark_band)
    curve = normal_dist(band_range, theta=0.01, mean=centre_band)
    curve -= np.min(curve)
    curve /= np.max(curve)

    weight_array = np.zeros(240)
    weight_array[low_bark_band:upper_bark_band] = curve

    windowed_loud_spec = []
    windowed_rms = []
    weighted_vals = []

    for frame in windowed_samples:
        # specific loudness of this window over 240 Bark bins
        N_entire, N_single = specific_loudness(frame, Pref=100.0, fs=fs, Mod=0)

        windowed_loud_spec.append(N_single)
        windowed_rms.append(np.sqrt(np.mean(frame * frame)))
        weighted_vals.append(np.sum(weight_array * N_single))

    mean_weight = np.mean(weighted_vals)
    weighted_weight = np.average(weighted_vals, weights=windowed_rms)

    return mean_weight, weighted_weight
1720
+
1721
+
1722
+
1723
+ '''
1724
+ Loudnorm function to be included in future update
1725
+ '''
1726
def loud_norm(audio, fs=44100, target_loudness=-24.0):
    '''
    Loudness-normalise audio data to a target integrated loudness.

    :param audio: 1-D numpy array of samples.
    :param fs: sample rate in Hz.
    :param target_loudness: target integrated loudness (LUFS).
    :return: the loudness-normalised audio, peak-limited to +/-1.0.
    '''
    meter = pyln.Meter(fs)

    # the loudness meter needs at least 0.4 seconds of audio; zero-pad
    # shorter input for measurement only (the returned audio is not padded)
    min_len = int(fs * 0.4)
    if len(audio) < min_len:
        len_check_audio = np.pad(audio, (0, min_len - len(audio)), 'constant', constant_values=0.0)
    else:
        len_check_audio = audio

    # measure the current loudness, then apply gain towards the target
    current_loudness = meter.integrated_loudness(len_check_audio)
    normalised_audio = pyln.normalize.loudness(audio, current_loudness, target_loudness)

    # avoid clipping: scale back to a peak of 1.0 if necessary
    peak = np.max(np.abs(normalised_audio))
    if peak > 1.0:
        normalised_audio /= peak

    return normalised_audio
1755
+
1756
+
1757
+
1758
+
1759
+
1760
def file_read(fname, fs=0, phase_correction=False, mono_sum=True, loudnorm=True, resample_low_fs=True):
    """
    Load audio for analysis, accepting either a file path or an array.

    :param fname: path to an audio file, or a numpy array of samples.
    :param fs: sample rate; required (non-zero) when *fname* is an array.
    :param phase_correction: phase-check stereo channels before mono summing.
    :param mono_sum: sum multichannel audio down to mono.
    :param loudnorm: loudness-normalise the audio to -24 LUFS.
    :param resample_low_fs: upsample audio below 44.1 kHz to avoid errors
        in the loudness model.

    :return: (audio_samples, fs)
    :raises ValueError: for an array without *fs*, empty audio, or silence.
    :raises TypeError: when *fname* is neither a path nor array-like.
    """
    if isinstance(fname, six.string_types):
        # read audio from disk with pysoundfile
        audio_samples, fs = sf.read(fname, always_2d=False)
    elif hasattr(fname, 'shape'):
        # already an array of samples; the caller must supply the rate
        if fs == 0:
            raise ValueError('If giving function an array, \'fs\' must be specified')
        audio_samples = fname
    else:
        raise TypeError('Input type of \'fname\' must be string, or have a shape attribute (e.g. a numpy array)')

    # reject empty input
    if audio_samples.size == 0:
        raise ValueError('Input audio file does not contain data')

    # reduce to mono
    if mono_sum:
        audio_samples = channel_reduction(audio_samples, phase_correction)

    # silence cannot be analysed (and would break normalisation)
    if np.max(np.abs(audio_samples)) == 0.0:
        raise ValueError('Input file is silence, cannot be analysed.')

    # loudness normalise
    if loudnorm:
        audio_samples = loud_norm(audio_samples, fs, target_loudness=-24.0)

    if resample_low_fs:
        # upsample if required to avoid errors downstream
        audio_samples, fs = check_upsampling(audio_samples, fs)

    return audio_samples, fs
1800
+
1801
+
1802
+
1803
def check_upsampling(audio_samples, fs, lowest_fs=44100):
    """
    Upsample the audio when its sample rate is below *lowest_fs*.

    :param audio_samples: 1-D numpy array of samples.
    :param fs: current sample rate in Hz.
    :param lowest_fs: minimum acceptable sample rate (default 44100).

    :return: (audio_samples, fs), resampled when necessary.
    """
    if fs >= lowest_fs:
        return audio_samples, fs

    # upsample to avoid errors when calculating specific loudness
    resampled = librosa.core.resample(audio_samples, fs, lowest_fs)
    return resampled, lowest_fs