# nbm_v1 / prepare_dataset.py
# unknown
# add app files
# 82b8273
import numpy as np
import os
import time
from utils import *
import librosa
import h5py
import pickle
from operator import itemgetter
# Birders whose annotation files carry an extra filename prefix / marker;
# every other birder uses an empty extra label.
_EXTRA_LABELS = {
    'AymericMousseau': 'amousseau_',
    'WillyRaitiere': 'willyraitiere_',
    'MaxencePajot': 'Piste de marqueur',
}
# Insertion order matters: prepare_dataset iterates this mapping in order.
_BIRDER_NAMES = [
    'NidalIssa',
    'KevinLeveque',
    'HerveRenaudineau',
    'GuillaumeBigayon',
    'GhislainRiou',
    'GaëtanMineau',
    'FredericCazaban',
    'ChristopheMercier',
    'AymericMousseau',
    'AdrienPajot',
    'WillyRaitiere',
    'MaxencePajot',
    'MathurinAubry',
    'LionelManceau',
]
# One sub-directory per birder under the dataset parent directory.
ornithos = {name: {'extra_label': _EXTRA_LABELS.get(name, '')} for name in _BIRDER_NAMES}
def prepare_dataset(parent_directory, test_files, export_negative=False, pcen=False, freq_accuracy=33.3, dt=0.003, overlap_spectro=0.4, w_pix=512, extra_label='',
split=None, normalize='', suppress_others=True, new_data=False):
"""
Loops over birder directories, calling process_birder_directory at turns. This creates a spectrogram img dataset of 256*512 img size, along with
a list of annotations under the form of bounding box coordinates around each bird calls in the images. Additional info is saved along bb coordinates,
such as birder and file name.
"""
if split is not None:
if split == 1:
ornithos = {
'NidalIssa': {
'extra_label': ''
},
'KevinLeveque': {
'extra_label': ''
},
'HerveRenaudineau': {
'extra_label': ''
},
'GuillaumeBigayon': {
'extra_label': ''
},
'GhislainRiou': {
'extra_label': ''
},
'GaëtanMineau': {
'extra_label': ''
}
}
extra_label += '_one'
elif split == 2:
ornithos = {
'WillyRaitiere': {
'extra_label': 'willyraitiere_'
}
}
extra_label += '_two'
else:
ornithos = {
'AdrienPajot': {
'extra_label': ''
},
'MaxencePajot': {
'extra_label': 'Piste de marqueur'
},
'MathurinAubry': {
'extra_label': ''
},
'LionelManceau': {
'extra_label': ''
},
'FredericCazaban': {
'extra_label': ''
},
'ChristopheMercier': {
'extra_label': ''
},
'AymericMousseau': {
'extra_label': 'amousseau_'
}
}
extra_label += '_three'
if new_data:
ornithos = {
'mediae': {'extra_label': ''}
}
extra_label += '_2'
t = time.time()
img_db = []
annotations = pd.DataFrame()
for idx, dic in ornithos.items():
directory = os.path.join(parent_directory, idx)
extra_str_label = dic['extra_label']
birder_img_db, birder_annotations, increment_birder = _process_birder_directory(directory, test_files, extra_str_label, pcen, freq_accuracy, dt, overlap_spectro,
w_pix, normalize, suppress_others, new_data)
img_db += birder_img_db
annotations = pd.concat([annotations, birder_annotations])
print(f'\n Ornitho: {idx}, time: {time.time() - t}, n_files: {increment_birder}')
print('*** Loading large files info ***')
for f in os.listdir(parent_directory):
if 'temp_large_file' in f:
with open(os.path.join(parent_directory, f), 'rb') as file:
large_file_dict = pickle.load(file)
img_db += large_file_dict['img_db']
annotations = pd.concat([annotations, large_file_dict['labels']])
os.remove(os.path.join(parent_directory, f))
print('*** Extracting positive and negative samples ***')
annotations['index'] = range(len(annotations))
# positive_annotations = annotations.loc[annotations['coord'].notnull()].copy()
# Stack positive and test samples into a single 4D array
if len(test_files) > 0:
test_annotations = annotations.loc[(annotations['coord'].notnull()) & (annotations['filename'].isin(test_files))].sort_values('index').copy()
test_idx = test_annotations['index'].values
test_img = np.stack(itemgetter(*test_idx)(img_db))
train_annotations = annotations.loc[~annotations['filename'].isin(test_files)].sort_values('index').copy()
train_idx = train_annotations['index'].values
train_img = np.stack(itemgetter(*train_idx)(img_db))
print(f'Extraction done in {time.time() - t}, serializing')
# Serialization of positive & test samples
dict_to_serialize = {
'train':{
'img_db': train_img,
'labels': train_annotations
}
}
if len(test_files) > 0:
dict_to_serialize.update({
'test':{
'img_db': test_img,
'labels': test_annotations
}
})
# Serialization of positive & test samples
for key, value in dict_to_serialize.items():
with h5py.File(os.path.join(parent_directory, f'{key}_img_db{extra_label}.hdf5'), 'w') as f:
f.create_dataset('img_db', data=value['img_db'])
with h5py.File(os.path.join(parent_directory, f'{key}_annotations{extra_label}.hdf5'), 'w') as f:
for idx in range(len(value['labels'])):
grp = f.create_group(str(idx))
subds = value['labels'].iloc[idx]
bb_coord = np.vstack(subds.coord)
grp.create_dataset('bb_coord', data=bb_coord)
for key in ['bird_id', 'filename', 'birder']:
grp.create_dataset(key, data=subds[key])
print(f'** done serializing positive samples, now mb turn of negative samples: {time.time() - t}')
# Serialization of negative test samples
if len(test_files) > 0:
negative_samples = annotations.loc[(annotations['coord'].isnull()) & (annotations['filename'].isin(test_files))].sort_values('index').copy()
negative_idx = negative_samples['index'].values
negative_img = np.stack(itemgetter(*negative_idx)(img_db))
with h5py.File(os.path.join(parent_directory, f'test_negative_img_db{extra_label}.hdf5'), 'w') as f:
f.create_dataset('img_db', data=negative_img)
print(f'** done serializing negative test samples: {time.time() - t}')
# Serialization of negative samples
if export_negative:
negative_samples = annotations.loc[annotations['coord'].isnull()].copy()
negative_idx = negative_samples['index'].values
negative_img = np.stack(itemgetter(*negative_idx)(img_db))
with h5py.File(os.path.join(parent_directory, 'negative_img_db.hdf5'), 'w') as f:
f.create_dataset('img_db', data=negative_img)
print(f'** done serializing negative_samples samples: {time.time() - t}')
print(f'Done in {time.time() - t}')
def process_data_directory(directory, pcen, freq_accuracy, dt, overlap_spectro, w_pix, normalize=''):
    """
    Processes every wav file of a directory without annotations.
    Returns the concatenated image list, the number of wav files seen and the
    number of files that produced no image.
    """
    img_db = []
    increment = 0
    failed = 0
    ### Iterate over wav files
    for entry in os.listdir(directory):
        if not entry.endswith(('.wav', '.WAV')):
            continue
        processor = File_Processor(directory, entry)
        file_imgs = processor.process_file(pcen=pcen, freq_accuracy=freq_accuracy, dt=dt, overlap_spectro=overlap_spectro, w_pix=w_pix, normalize=normalize)
        if not file_imgs:
            failed += 1
        img_db += file_imgs
        increment += 1
    return img_db, increment, failed
def _process_birder_directory(directory, test_files, extra_str_label, pcen, freq_accuracy, dt, overlap_spectro, w_pix, normalize='', suppress_others=True,
                              new_data=False):
    """
    Processes every wav file of one birder directory through File_Processor.

    Returns
    -------
    img_db: list of spectrogram images kept in memory (large files are pickled
        to the parent directory instead, as 'temp_large_file_*' files)
    labels_: DataFrame of bounding-box annotations for the in-memory images
    increment_birder: number of wav files successfully processed
    """
    parent_dir = os.path.dirname(directory)
    basename = os.path.basename(directory)
    img_db = []
    labels_ = pd.DataFrame()
    labels = create_label_dataset(directory, extra_str_label=extra_str_label, suppress_others=suppress_others, suppress_noise=False, is_csv=new_data)
    # Suppress noise from test positive samples
    labels = labels.loc[(labels['filename'].map(lambda x: x not in test_files)) | (labels['bird_id'] != -1)].copy()
    ### Iterate over wav files
    increment_birder = 0
    for j, f in enumerate(os.listdir(directory)):
        if (f.endswith('.wav')) or (f.endswith('.WAV')):
            # NOTE(review): unlike File_Processor.process_file, the extra label
            # prefix is not stripped before this test-set lookup -- confirm
            # test_files entries never carry the prefix
            is_test = f.split('.')[0] in test_files
            fp = File_Processor(directory, f, labels)
            try:
                new_img_db, new_labels_ = fp.process_file(extra_str_label=extra_str_label, freq_accuracy=freq_accuracy, dt=dt, overlap_spectro=overlap_spectro, pcen=pcen,
                                                          w_pix=w_pix, normalize=normalize, is_test=is_test)
            except Exception as e:
                # Fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and hid the failure cause
                print(f'File {f} failed ({e!r}), skipping file')
                continue
            # if the audio file is too long, do not keep the spectrogram in memory but serialize it for the time being
            if fp.large_file:
                save_path = os.path.join(parent_dir, f'temp_large_file_{basename}_{j}')
                # Fix: the original `with open(...) as f` clobbered the loop
                # variable `f`
                with open(save_path, 'wb') as dump_file:
                    pickle.dump({'img_db': new_img_db, 'labels': new_labels_}, dump_file)
            # else simply append to current img db list / labels df
            else:
                img_db += new_img_db
                labels_ = pd.concat([labels_, new_labels_])
            increment_birder += 1
    return img_db, labels_, increment_birder
class File_Processor:
    """
    Turns one wav file into a list of fixed-size spectrogram images of shape
    (channels, H_PIX, W_PIX) and, when an annotation DataFrame is supplied,
    a DataFrame of bounding-box labels for each image.
    """
    ### Parameters definition
    H_PIX = 256 # px
    LOW_FREQ = 500 # hz
    def __init__(self, directory, file, labels=None):
        # directory / file: location of the wav file to process
        # labels: optional annotation DataFrame (one row per bird call)
        self.directory = directory
        self.file = file
        self.labels = labels
    def process_file(self, extra_str_label='', freq_accuracy=33.3, dt=0.003, overlap_spectro=0.4, pcen=False, w_pix=512, normalize='', limit_points=757576, is_test=False):
        '''
        Generates and split spectrogram into images of chosen width, and associate labels to each image under the form of bounding box coordinates

        Returns
        -------
        (img_db, labels_) when labels were supplied at construction, otherwise
        the bare img_db list. Files with an unsupported sampling rate yield an
        empty result.
        '''
        # hop duration & frequence accuracy
        self.FREQ_ACCURACY = freq_accuracy
        self.DT = dt
        wav_path = os.path.join(self.directory, self.file)
        self.filename = self.file.split('.')[0].replace(extra_str_label, '')
        self.is_test = is_test
        # Final images
        self.W_PIX = w_pix
        self.HOP_SPECTRO = int((1 - overlap_spectro) * self.W_PIX)
        # Generate spectrogram
        librosa_data, FREQ = self._load(wav_path)
        # If sampling frequency is not 44k or 48kHz do not process
        if (FREQ != 44100) and (FREQ != 48000):
            print(f'Could not process file sampled at {FREQ} hz')
            # Fix: callers read `self.large_file` after this method and, when
            # labels were supplied, unpack two return values; the original
            # bare `return []` broke both on unsupported sample rates.
            self.large_file = False
            if self.labels is not None:
                return [], pd.DataFrame()
            return []
        self.FREQ = FREQ
        self.WIN_LENGTH = int(self.FREQ / self.FREQ_ACCURACY)
        self.HOP_LENGTH = int(self.FREQ * self.DT)
        overlap_fft = np.round(1 - self.HOP_LENGTH / self.WIN_LENGTH, 3)
        # Actual hop duration & frequence accuracy
        self.FREQ_ACCURACY = self.FREQ / self.WIN_LENGTH
        self.DT = int((1 - overlap_fft) * self.WIN_LENGTH) / self.FREQ
        # Cut low and high freq
        self.LOW_IDX = 1 + int(self.LOW_FREQ / self.FREQ_ACCURACY)
        self.HIGH_IDX = self.LOW_IDX + self.H_PIX
        self.LOW_FREQ = (self.LOW_IDX - 1) * self.FREQ_ACCURACY
        self.HIGH_FREQ = (self.HIGH_IDX - 1) * self.FREQ_ACCURACY
        if (len(librosa_data) <= limit_points * self.HOP_LENGTH):
            self.large_file = False
            spectro = self._spectrogram(librosa_data)
            power_spec = self._process_spectro(spectro, pcen=pcen, normalize=normalize)
        else:
            # if file is too long, process data per (overlapping) block, serizalize between blocks and load all block data before the end
            self.large_file = True
            start_idx, cut_idx, n_overlaps = self._block_spectrogram_idx(librosa_data, limit_points)
            t = time.time()
            temp_dir = os.getcwd()
            for i, (start, end) in enumerate(zip(start_idx, cut_idx)):
                temp_file_path = os.path.join(temp_dir, f'power_spec_temp_{i}')
                data_cut = librosa_data[start:end]
                spectro_cut = self._spectrogram(data_cut)
                # Trim the frames that straddle block junctions (stft is
                # centered, so edge frames of each block are incomplete)
                if i == 0:
                    spectro_cut = spectro_cut[:, :-n_overlaps]
                elif i == len(cut_idx) - 1:
                    spectro_cut = spectro_cut[:, n_overlaps:]
                else:
                    spectro_cut = spectro_cut[:, n_overlaps:-n_overlaps]
                new_power_spec = self._process_spectro(spectro_cut, pcen=pcen, normalize=normalize)
                # Save spectrogram block i
                with open(temp_file_path, 'wb') as dump_file:
                    pickle.dump(new_power_spec, dump_file)
                print(f'Block {i} processed in {time.time() - t}')
            # Now load and concatenate intermediate power spec blocks
            power_spec = np.array([])
            for i in range(len(start_idx)):
                temp_file_path = os.path.join(temp_dir, f'power_spec_temp_{i}')
                with open(temp_file_path, 'rb') as load_file:
                    new_power_spec = pickle.load(load_file)
                if len(power_spec) == 0:
                    power_spec = new_power_spec
                else:
                    power_spec = np.concatenate([power_spec, new_power_spec], axis=-1)
            # Remove temporary files (plain loop instead of a side-effect
            # list comprehension)
            for tmp in os.listdir(temp_dir):
                if 'power_spec_temp_' in tmp:
                    os.remove(os.path.join(temp_dir, tmp))
        # Record the length of the spectrogram
        self.spectrogram_length = power_spec.shape[-1]
        # images to append
        img_db = self._split_power_spec(power_spec, normalize=normalize)
        # labels to append
        if self.labels is not None:
            labels_ = self._merge_and_filter_labels(img_db)
            # If file contains no positive sample then empty list / df is returned
            if len(labels_) == 0:
                return [], labels_
            # Else keep only positive images
            positive_idx = labels_['index'].values
            positive_img_db = list(itemgetter(*positive_idx)(img_db))
            # itemgetter returns a bare array for a single index
            if len(positive_img_db) == 1:
                positive_img_db[0] = positive_img_db[0][np.newaxis, :]
            return positive_img_db, labels_
        else:
            return img_db
    def _process_spectro(self, spectrogram, pcen, normalize):
        """Crops the spectrogram to the [LOW_IDX, HIGH_IDX) band and applies the log/pcen frontend."""
        spectrogram = spectrogram[self.LOW_IDX:self.HIGH_IDX, :]
        # Generate log power spectrogram
        power_spec = self._frontend(spectrogram, pcen=pcen)
        return power_spec
    def _load(self, wav_path):
        """Loads the wav file at its native sampling rate; returns (samples, rate)."""
        librosa_data, freq = librosa.load(wav_path, sr=None)
        return librosa_data, freq
    def _spectrogram(self, data):
        """Computes the complex STFT with the window/hop set up by process_file."""
        spectrogram = librosa.stft(data, win_length=self.WIN_LENGTH, hop_length=self.HOP_LENGTH, n_fft=self.WIN_LENGTH)
        return spectrogram
    def _block_spectrogram_idx(self, data, limit_points):
        """
        Splits librosa data into chunks of defined width. Attention is carried to preserve fft integrity by carefully overlapping successive chunks
        (and considering that subsequent librosa stft is centered).
        Returns (start_idx, cut_idx, n_overlaps): sample ranges of each chunk
        and the number of edge frames to discard at each junction.
        """
        data_len = len(data)
        cut_every = limit_points * self.HOP_LENGTH
        cut_idx = [int(i * cut_every) for i in np.arange(1, data_len / cut_every)]
        # Border effects: how many fft points must be discarded at the junction between cuts because they do not contain entire signal frames
        n_overlaps = int((self.WIN_LENGTH - 1) / self.HOP_LENGTH)
        start_idx = [0] + [cut - (2 * n_overlaps + 1) * self.HOP_LENGTH for cut in cut_idx]
        cut_idx = cut_idx + [data_len]
        return start_idx, cut_idx, n_overlaps
    def _frontend(self, spectrogram, pcen=False, chunk=600, eps=1e-8):
        """
        Computes modulus of spectrogram amplitudes. Spectrogram is first split into 1 hours chunks, and further into smaller
        chunks whose log power are concatenated. This appeared to run faster on my pc but can ofc be challenged (esp. the smaller
        chunk division).
        If pcen is activated, concatenate it to the standard preprocessing as an additional channel
        """
        chunk_size = int(chunk / self.DT)
        square_power_specs = []
        log_power_specs = []
        # Split spectrogram into 1 hour chunks
        splits = np.array_split(spectrogram, max(1, int(spectrogram.shape[1] * self.DT / 3600)), axis=1)
        for i, split in enumerate(splits):
            power_spec = np.abs(split[:, :chunk_size])
            log_power_spec = np.log(eps + power_spec)
            if pcen:
                square_power_spec = power_spec**2
            # Fix: the inner index used to shadow the outer loop variable `i`
            for j in range(1, 1 + split.shape[1] // chunk_size):
                power_spec = np.abs(split[:, j * chunk_size:(j + 1) * chunk_size])
                log_power_spec = np.hstack((log_power_spec, np.log(eps + power_spec)))
                if pcen:
                    square_power_spec = np.hstack((square_power_spec, power_spec**2))
            log_power_specs.append(log_power_spec)
            if pcen:
                square_power_specs.append(square_power_spec)
        log_power_spec = np.hstack(log_power_specs)
        output = log_power_spec.reshape((1,) + log_power_spec.shape)
        # Per-channel Energy Normalization. Parameters from Lostalen et al., default time constant.
        if pcen:
            # Fix: the original assigned librosa.pcen's result to the loop
            # variable, discarding it -- PCEN was never actually applied
            square_power_specs = [
                librosa.pcen(sps, sr=self.FREQ, hop_length=self.HOP_LENGTH, time_constant=0.4,
                             gain=0.8, power=0.25, bias=10)
                for sps in square_power_specs
            ]
            square_power_spec = np.hstack(square_power_specs)
            square_power_spec = square_power_spec.reshape((1,) + square_power_spec.shape)
            output = np.vstack((output, square_power_spec))
        return output
    def _split_power_spec(self, log_power_spec, normalize=''):
        """
        Splits a spectrogram 2D array along axis=1 given hop size and img width.
        The last image is padded to W_PIX by repeating its own trailing columns.
        """
        if normalize == 'global':
            maxi = log_power_spec.max()
            log_power_spec = log_power_spec - maxi
        # Split into overlapping fixed size images
        img_db = [log_power_spec[..., k * self.HOP_SPECTRO: k * self.HOP_SPECTRO + self.W_PIX] for k in range(max(1,
                  int(1 + np.ceil((log_power_spec.shape[-1] - self.W_PIX) / self.HOP_SPECTRO))))]
        if normalize == 'local':
            img_db = [subspec - subspec.max() for subspec in img_db]
        if img_db[-1].shape[-1] < self.W_PIX:
            while img_db[-1].shape[-1] < self.W_PIX:
                img = np.concatenate((img_db[-1], img_db[-1][..., -(self.W_PIX - img_db[-1].shape[-1]):]), axis=2)
                img_db[-1] = img
        return img_db
    def _merge_and_filter_labels(self, img_db):
        """
        Computes and return a dataframe containing img indexes and a list of bb coordinates for each images in a given file
        """
        # Img coordinates in original spectrogram
        img_coord = [(i * self.HOP_SPECTRO, i * self.HOP_SPECTRO + self.W_PIX - 1) for i in range(len(img_db))]
        img_coord = pd.DataFrame(img_coord).rename(columns={0: 'start', 1:'end'})
        # Merge filtered label dataset with each image in collection, keep only annotations that intersect the images
        labels_ = self.labels.loc[self.labels['filename'] == self.filename].copy()
        if len(labels_) == 0:
            labels_ = pd.DataFrame({key: [] for key in ['index', 'coord', 'bird_id', 'filename', 'birder']})
            return labels_
        # Convert second to pixels given DT, the time equivalent of hop_size
        for ex_label, new_label in zip(['t_start', 't_end'], ['x_1', 'x_2']):
            labels_[new_label] = (labels_[ex_label].astype(float) / self.DT).astype(int)
        # Same for frequencies
        for ex_label, new_label in zip(['f_start', 'f_end'], ['y_1', 'y_2']):
            labels_[new_label] = ((labels_[ex_label].clip(lower=self.LOW_FREQ, upper=self.HIGH_FREQ) - self.LOW_FREQ) / self.FREQ_ACCURACY).astype(int)
        labels_ = labels_.loc[labels_['y_1'] != labels_['y_2']]
        labels_.index = range(len(labels_))
        labels_['w'] = labels_['x_2'] - labels_['x_1'] + 1
        labels_['h'] = labels_['y_2'] - labels_['y_1'] + 1
        for size in ['w', 'h']:
            labels_ = labels_.loc[labels_[size] > 0]
        # Cross join every label with every image through a constant key
        labels_['joint'] = 1
        img_coord['joint'] = 1
        img_coord.reset_index(inplace=True)
        coord = ['x_1', 'y_1', 'x_2', 'y_2']
        labels_ = labels_[coord + ['w', 'h', 'joint', 'bird_id']].merge(img_coord, on='joint')
        labels_ = labels_.loc[(labels_['x_1'].between(labels_['start'], labels_['end'])) | (labels_['x_2'].between(labels_['start'], labels_['end']))]
        # Supress bbox with too small intersection with spectrogram
        labels_['inside'] = labels_[['x_2', 'end']].min(axis=1) - labels_[['x_1', 'start']].max(axis=1) + 1
        cond_1 = (labels_['inside'] < 0.5 * labels_['w']) & (labels_['inside'] < 15)
        cond_2 = (labels_['inside'] < 0.1 * labels_['w'])
        labels_ = labels_.loc[~(cond_1 | cond_2)]
        # Bounding boxes are expanded 10% in every direction
        labels_['x_1'] = (labels_['x_1'] - labels_['start'] - (labels_['w'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(lower=0)
        labels_['x_2'] = (labels_['x_2'] - labels_['start'] + (labels_['w'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(upper=self.W_PIX - 1)
        labels_['y_1'] = (labels_['y_1'] - (labels_['h'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(lower=0)
        labels_['y_2'] = (labels_['y_2'] + (labels_['h'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(upper=self.H_PIX - 1)
        labels_['w'] = labels_['x_2'] - labels_['x_1']
        labels_['h'] = labels_['y_2'] - labels_['y_1']
        labels_['coord'] = [(x_1, y_1, x_2, y_2) for (x_1, y_1, x_2, y_2) in zip(labels_['x_1'], labels_['y_1'],
                                                                                 labels_['x_2'], labels_['y_2'])]
        # Delete negative samples if they appear in a positive image
        # NOTE(review): this default (inner) merge drops images whose boxes are
        # all noise before the `count == 0` test below can ever be true --
        # confirm whether a left merge with fillna(0) was intended
        labels_ = labels_.merge(labels_.loc[labels_['bird_id'] != -1].groupby('index').size().reset_index().rename(columns={0: 'count'}), on='index')
        labels_ = labels_.loc[(labels_['bird_id'] != -1) | (labels_['count'] == 0)]
        # One row per img
        labels_ = labels_.groupby('index', as_index=False).agg({'coord': lambda x: x.tolist(), 'bird_id': lambda x: x.tolist()})
        # Complete with negative samples
        labels_ = pd.merge(
            pd.DataFrame({'index': range(len(img_db))}, index=range(len(img_db))),
            labels_,
            how='outer',
            on='index'
        )
        if not self.is_test:
            # Sample negative images (10 % of total positive image count)
            positive_count = labels_['coord'].isnull().value_counts()
            if False in positive_count.index:
                n_positive = positive_count[False]
            else:
                n_positive = 0
            negative_idx = labels_.loc[labels_['coord'].isnull(), 'index'].values
            if len(negative_idx) > 0:
                # Fix: sample without replacement and cap by the number of
                # available negatives (the original sampled with replacement
                # and could draw duplicate indices)
                n_sample = min(int(n_positive * 0.1), 20, len(negative_idx))
                negative_idx = sorted(np.random.choice(negative_idx, n_sample, replace=False))
                # Fill negative images coord column with empty lists
                labels_.loc[negative_idx, 'coord'] = labels_.loc[negative_idx, 'coord'].map(lambda x: [(-1, -1, -1, -1)])
                # Fill negative images bird_id with [-1]
                labels_.loc[negative_idx, 'bird_id'] = labels_.loc[negative_idx, 'bird_id'].map(lambda x: [-1])
        labels_ = labels_.loc[labels_['coord'].notnull()].copy()
        # Replace noise samples coord with -1 list
        noise_idx = labels_['bird_id'].map(lambda x: -1 in x)
        labels_.loc[noise_idx, 'coord'] = labels_.loc[noise_idx, 'coord'].map(lambda x: [(-1, -1, -1, -1)])
        labels_['filename'] = self.filename
        # Fix: was `self.directory.split('\\')[-1]`, which only worked with
        # Windows-style separators; use the platform-aware basename instead
        labels_['birder'] = os.path.basename(os.path.normpath(self.directory))
        return labels_