# nbm_v1 / prepare_dataset.py
# unknown
# add app files
# 82b8273
import numpy as np
import os
import time
from utils import *
import librosa
import h5py
import pickle
from operator import itemgetter
# Birders whose annotation files carry an extra filename prefix / marker;
# every other birder uses an empty extra label.
_EXTRA_LABELS = {
    'AymericMousseau': 'amousseau_',
    'WillyRaitiere': 'willyraitiere_',
    'MaxencePajot': 'Piste de marqueur',
}
# Insertion order matters: prepare_dataset iterates this mapping in order.
_BIRDER_NAMES = [
    'NidalIssa',
    'KevinLeveque',
    'HerveRenaudineau',
    'GuillaumeBigayon',
    'GhislainRiou',
    'GaëtanMineau',
    'FredericCazaban',
    'ChristopheMercier',
    'AymericMousseau',
    'AdrienPajot',
    'WillyRaitiere',
    'MaxencePajot',
    'MathurinAubry',
    'LionelManceau',
]
# One sub-directory per birder under the dataset parent directory.
ornithos = {name: {'extra_label': _EXTRA_LABELS.get(name, '')} for name in _BIRDER_NAMES}
def prepare_dataset(parent_directory, test_files, export_negative=False, pcen=False, freq_accuracy=33.3, dt=0.003, overlap_spectro=0.4, w_pix=512, extra_label='',
split=None, normalize='', suppress_others=True, new_data=False):
"""
Loops over birder directories, calling process_birder_directory at turns. This creates a spectrogram img dataset of 256*512 img size, along with
a list of annotations under the form of bounding box coordinates around each bird calls in the images. Additional info is saved along bb coordinates,
such as birder and file name.
"""
if split is not None:
if split == 1:
ornithos = {
'NidalIssa': {
'extra_label': ''
},
'KevinLeveque': {
'extra_label': ''
},
'HerveRenaudineau': {
'extra_label': ''
},
'GuillaumeBigayon': {
'extra_label': ''
},
'GhislainRiou': {
'extra_label': ''
},
'GaëtanMineau': {
'extra_label': ''
}
}
extra_label += '_one'
elif split == 2:
ornithos = {
'WillyRaitiere': {
'extra_label': 'willyraitiere_'
}
}
extra_label += '_two'
else:
ornithos = {
'AdrienPajot': {
'extra_label': ''
},
'MaxencePajot': {
'extra_label': 'Piste de marqueur'
},
'MathurinAubry': {
'extra_label': ''
},
'LionelManceau': {
'extra_label': ''
},
'FredericCazaban': {
'extra_label': ''
},
'ChristopheMercier': {
'extra_label': ''
},
'AymericMousseau': {
'extra_label': 'amousseau_'
}
}
extra_label += '_three'
if new_data:
ornithos = {
'mediae': {'extra_label': ''}
}
extra_label += '_2'
t = time.time()
img_db = []
annotations = pd.DataFrame()
for idx, dic in ornithos.items():
directory = os.path.join(parent_directory, idx)
extra_str_label = dic['extra_label']
birder_img_db, birder_annotations, increment_birder = _process_birder_directory(directory, test_files, extra_str_label, pcen, freq_accuracy, dt, overlap_spectro,
w_pix, normalize, suppress_others, new_data)
img_db += birder_img_db
annotations = pd.concat([annotations, birder_annotations])
print(f'\n Ornitho: {idx}, time: {time.time() - t}, n_files: {increment_birder}')
print('*** Loading large files info ***')
for f in os.listdir(parent_directory):
if 'temp_large_file' in f:
with open(os.path.join(parent_directory, f), 'rb') as file:
large_file_dict = pickle.load(file)
img_db += large_file_dict['img_db']
annotations = pd.concat([annotations, large_file_dict['labels']])
os.remove(os.path.join(parent_directory, f))
print('*** Extracting positive and negative samples ***')
annotations['index'] = range(len(annotations))
# positive_annotations = annotations.loc[annotations['coord'].notnull()].copy()
# Stack positive and test samples into a single 4D array
if len(test_files) > 0:
test_annotations = annotations.loc[(annotations['coord'].notnull()) & (annotations['filename'].isin(test_files))].sort_values('index').copy()
test_idx = test_annotations['index'].values
test_img = np.stack(itemgetter(*test_idx)(img_db))
train_annotations = annotations.loc[~annotations['filename'].isin(test_files)].sort_values('index').copy()
train_idx = train_annotations['index'].values
train_img = np.stack(itemgetter(*train_idx)(img_db))
print(f'Extraction done in {time.time() - t}, serializing')
# Serialization of positive & test samples
dict_to_serialize = {
'train':{
'img_db': train_img,
'labels': train_annotations
}
}
if len(test_files) > 0:
dict_to_serialize.update({
'test':{
'img_db': test_img,
'labels': test_annotations
}
})
# Serialization of positive & test samples
for key, value in dict_to_serialize.items():
with h5py.File(os.path.join(parent_directory, f'{key}_img_db{extra_label}.hdf5'), 'w') as f:
f.create_dataset('img_db', data=value['img_db'])
with h5py.File(os.path.join(parent_directory, f'{key}_annotations{extra_label}.hdf5'), 'w') as f:
for idx in range(len(value['labels'])):
grp = f.create_group(str(idx))
subds = value['labels'].iloc[idx]
bb_coord = np.vstack(subds.coord)
grp.create_dataset('bb_coord', data=bb_coord)
for key in ['bird_id', 'filename', 'birder']:
grp.create_dataset(key, data=subds[key])
print(f'** done serializing positive samples, now mb turn of negative samples: {time.time() - t}')
# Serialization of negative test samples
if len(test_files) > 0:
negative_samples = annotations.loc[(annotations['coord'].isnull()) & (annotations['filename'].isin(test_files))].sort_values('index').copy()
negative_idx = negative_samples['index'].values
negative_img = np.stack(itemgetter(*negative_idx)(img_db))
with h5py.File(os.path.join(parent_directory, f'test_negative_img_db{extra_label}.hdf5'), 'w') as f:
f.create_dataset('img_db', data=negative_img)
print(f'** done serializing negative test samples: {time.time() - t}')
# Serialization of negative samples
if export_negative:
negative_samples = annotations.loc[annotations['coord'].isnull()].copy()
negative_idx = negative_samples['index'].values
negative_img = np.stack(itemgetter(*negative_idx)(img_db))
with h5py.File(os.path.join(parent_directory, 'negative_img_db.hdf5'), 'w') as f:
f.create_dataset('img_db', data=negative_img)
print(f'** done serializing negative_samples samples: {time.time() - t}')
print(f'Done in {time.time() - t}')
def process_data_directory(directory, pcen, freq_accuracy, dt, overlap_spectro, w_pix, normalize=''):
    """
    Processes every wav file of a directory without annotations.
    Returns the concatenated image list, the number of wav files seen and the
    number of files that produced no image.
    """
    img_db = []
    increment = 0
    failed = 0
    ### Iterate over wav files
    for entry in os.listdir(directory):
        if not entry.endswith(('.wav', '.WAV')):
            continue
        processor = File_Processor(directory, entry)
        file_imgs = processor.process_file(pcen=pcen, freq_accuracy=freq_accuracy, dt=dt, overlap_spectro=overlap_spectro, w_pix=w_pix, normalize=normalize)
        if not file_imgs:
            failed += 1
        img_db += file_imgs
        increment += 1
    return img_db, increment, failed
def _process_birder_directory(directory, test_files, extra_str_label, pcen, freq_accuracy, dt, overlap_spectro, w_pix, normalize='', suppress_others=True,
                              new_data=False):
    """
    Processes every wav file of one birder directory through File_Processor.

    Returns
    -------
    img_db: list of spectrogram images kept in memory (large files are pickled
        to the parent directory instead, as 'temp_large_file_*' files)
    labels_: DataFrame of bounding-box annotations for the in-memory images
    increment_birder: number of wav files successfully processed
    """
    parent_dir = os.path.dirname(directory)
    basename = os.path.basename(directory)
    img_db = []
    labels_ = pd.DataFrame()
    labels = create_label_dataset(directory, extra_str_label=extra_str_label, suppress_others=suppress_others, suppress_noise=False, is_csv=new_data)
    # Suppress noise from test positive samples
    labels = labels.loc[(labels['filename'].map(lambda x: x not in test_files)) | (labels['bird_id'] != -1)].copy()
    ### Iterate over wav files
    increment_birder = 0
    for j, f in enumerate(os.listdir(directory)):
        if (f.endswith('.wav')) or (f.endswith('.WAV')):
            # NOTE(review): unlike File_Processor.process_file, the extra label
            # prefix is not stripped before this test-set lookup -- confirm
            # test_files entries never carry the prefix
            is_test = f.split('.')[0] in test_files
            fp = File_Processor(directory, f, labels)
            try:
                new_img_db, new_labels_ = fp.process_file(extra_str_label=extra_str_label, freq_accuracy=freq_accuracy, dt=dt, overlap_spectro=overlap_spectro, pcen=pcen,
                                                          w_pix=w_pix, normalize=normalize, is_test=is_test)
            except Exception as e:
                # Fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and hid the failure cause
                print(f'File {f} failed ({e!r}), skipping file')
                continue
            # if the audio file is too long, do not keep the spectrogram in memory but serialize it for the time being
            if fp.large_file:
                save_path = os.path.join(parent_dir, f'temp_large_file_{basename}_{j}')
                # Fix: the original `with open(...) as f` clobbered the loop
                # variable `f`
                with open(save_path, 'wb') as dump_file:
                    pickle.dump({'img_db': new_img_db, 'labels': new_labels_}, dump_file)
            # else simply append to current img db list / labels df
            else:
                img_db += new_img_db
                labels_ = pd.concat([labels_, new_labels_])
            increment_birder += 1
    return img_db, labels_, increment_birder
class File_Processor:
    """
    Turns one wav file into a list of fixed-size spectrogram images of shape
    (channels, H_PIX, W_PIX) and, when an annotation DataFrame is supplied,
    a DataFrame of bounding-box labels for each image.
    """
    ### Parameters definition
    H_PIX = 256 # px
    LOW_FREQ = 500 # hz
    def __init__(self, directory, file, labels=None):
        # directory / file: location of the wav file to process
        # labels: optional annotation DataFrame (one row per bird call)
        self.directory = directory
        self.file = file
        self.labels = labels
    def process_file(self, extra_str_label='', freq_accuracy=33.3, dt=0.003, overlap_spectro=0.4, pcen=False, w_pix=512, normalize='', limit_points=757576, is_test=False):
        '''
        Generates and split spectrogram into images of chosen width, and associate labels to each image under the form of bounding box coordinates

        Returns
        -------
        (img_db, labels_) when labels were supplied at construction, otherwise
        the bare img_db list. Files with an unsupported sampling rate yield an
        empty result.
        '''
        # hop duration & frequence accuracy
        self.FREQ_ACCURACY = freq_accuracy
        self.DT = dt
        wav_path = os.path.join(self.directory, self.file)
        self.filename = self.file.split('.')[0].replace(extra_str_label, '')
        self.is_test = is_test
        # Final images
        self.W_PIX = w_pix
        self.HOP_SPECTRO = int((1 - overlap_spectro) * self.W_PIX)
        # Generate spectrogram
        librosa_data, FREQ = self._load(wav_path)
        # If sampling frequency is not 44k or 48kHz do not process
        if (FREQ != 44100) and (FREQ != 48000):
            print(f'Could not process file sampled at {FREQ} hz')
            # Fix: callers read `self.large_file` after this method and, when
            # labels were supplied, unpack two return values; the original
            # bare `return []` broke both on unsupported sample rates.
            self.large_file = False
            if self.labels is not None:
                return [], pd.DataFrame()
            return []
        self.FREQ = FREQ
        self.WIN_LENGTH = int(self.FREQ / self.FREQ_ACCURACY)
        self.HOP_LENGTH = int(self.FREQ * self.DT)
        overlap_fft = np.round(1 - self.HOP_LENGTH / self.WIN_LENGTH, 3)
        # Actual hop duration & frequence accuracy
        self.FREQ_ACCURACY = self.FREQ / self.WIN_LENGTH
        self.DT = int((1 - overlap_fft) * self.WIN_LENGTH) / self.FREQ
        # Cut low and high freq
        self.LOW_IDX = 1 + int(self.LOW_FREQ / self.FREQ_ACCURACY)
        self.HIGH_IDX = self.LOW_IDX + self.H_PIX
        self.LOW_FREQ = (self.LOW_IDX - 1) * self.FREQ_ACCURACY
        self.HIGH_FREQ = (self.HIGH_IDX - 1) * self.FREQ_ACCURACY
        if (len(librosa_data) <= limit_points * self.HOP_LENGTH):
            self.large_file = False
            spectro = self._spectrogram(librosa_data)
            power_spec = self._process_spectro(spectro, pcen=pcen, normalize=normalize)
        else:
            # if file is too long, process data per (overlapping) block, serizalize between blocks and load all block data before the end
            self.large_file = True
            start_idx, cut_idx, n_overlaps = self._block_spectrogram_idx(librosa_data, limit_points)
            t = time.time()
            temp_dir = os.getcwd()
            for i, (start, end) in enumerate(zip(start_idx, cut_idx)):
                temp_file_path = os.path.join(temp_dir, f'power_spec_temp_{i}')
                data_cut = librosa_data[start:end]
                spectro_cut = self._spectrogram(data_cut)
                # Trim the frames that straddle block junctions (stft is
                # centered, so edge frames of each block are incomplete)
                if i == 0:
                    spectro_cut = spectro_cut[:, :-n_overlaps]
                elif i == len(cut_idx) - 1:
                    spectro_cut = spectro_cut[:, n_overlaps:]
                else:
                    spectro_cut = spectro_cut[:, n_overlaps:-n_overlaps]
                new_power_spec = self._process_spectro(spectro_cut, pcen=pcen, normalize=normalize)
                # Save spectrogram block i
                with open(temp_file_path, 'wb') as dump_file:
                    pickle.dump(new_power_spec, dump_file)
                print(f'Block {i} processed in {time.time() - t}')
            # Now load and concatenate intermediate power spec blocks
            power_spec = np.array([])
            for i in range(len(start_idx)):
                temp_file_path = os.path.join(temp_dir, f'power_spec_temp_{i}')
                with open(temp_file_path, 'rb') as load_file:
                    new_power_spec = pickle.load(load_file)
                if len(power_spec) == 0:
                    power_spec = new_power_spec
                else:
                    power_spec = np.concatenate([power_spec, new_power_spec], axis=-1)
            # Remove temporary files (plain loop instead of a side-effect
            # list comprehension)
            for tmp in os.listdir(temp_dir):
                if 'power_spec_temp_' in tmp:
                    os.remove(os.path.join(temp_dir, tmp))
        # Record the length of the spectrogram
        self.spectrogram_length = power_spec.shape[-1]
        # images to append
        img_db = self._split_power_spec(power_spec, normalize=normalize)
        # labels to append
        if self.labels is not None:
            labels_ = self._merge_and_filter_labels(img_db)
            # If file contains no positive sample then empty list / df is returned
            if len(labels_) == 0:
                return [], labels_
            # Else keep only positive images
            positive_idx = labels_['index'].values
            positive_img_db = list(itemgetter(*positive_idx)(img_db))
            # itemgetter returns a bare array for a single index
            if len(positive_img_db) == 1:
                positive_img_db[0] = positive_img_db[0][np.newaxis, :]
            return positive_img_db, labels_
        else:
            return img_db
    def _process_spectro(self, spectrogram, pcen, normalize):
        """Crops the spectrogram to the [LOW_IDX, HIGH_IDX) band and applies the log/pcen frontend."""
        spectrogram = spectrogram[self.LOW_IDX:self.HIGH_IDX, :]
        # Generate log power spectrogram
        power_spec = self._frontend(spectrogram, pcen=pcen)
        return power_spec
    def _load(self, wav_path):
        """Loads the wav file at its native sampling rate; returns (samples, rate)."""
        librosa_data, freq = librosa.load(wav_path, sr=None)
        return librosa_data, freq
    def _spectrogram(self, data):
        """Computes the complex STFT with the window/hop set up by process_file."""
        spectrogram = librosa.stft(data, win_length=self.WIN_LENGTH, hop_length=self.HOP_LENGTH, n_fft=self.WIN_LENGTH)
        return spectrogram
    def _block_spectrogram_idx(self, data, limit_points):
        """
        Splits librosa data into chunks of defined width. Attention is carried to preserve fft integrity by carefully overlapping successive chunks
        (and considering that subsequent librosa stft is centered).
        Returns (start_idx, cut_idx, n_overlaps): sample ranges of each chunk
        and the number of edge frames to discard at each junction.
        """
        data_len = len(data)
        cut_every = limit_points * self.HOP_LENGTH
        cut_idx = [int(i * cut_every) for i in np.arange(1, data_len / cut_every)]
        # Border effects: how many fft points must be discarded at the junction between cuts because they do not contain entire signal frames
        n_overlaps = int((self.WIN_LENGTH - 1) / self.HOP_LENGTH)
        start_idx = [0] + [cut - (2 * n_overlaps + 1) * self.HOP_LENGTH for cut in cut_idx]
        cut_idx = cut_idx + [data_len]
        return start_idx, cut_idx, n_overlaps
    def _frontend(self, spectrogram, pcen=False, chunk=600, eps=1e-8):
        """
        Computes modulus of spectrogram amplitudes. Spectrogram is first split into 1 hours chunks, and further into smaller
        chunks whose log power are concatenated. This appeared to run faster on my pc but can ofc be challenged (esp. the smaller
        chunk division).
        If pcen is activated, concatenate it to the standard preprocessing as an additional channel
        """
        chunk_size = int(chunk / self.DT)
        square_power_specs = []
        log_power_specs = []
        # Split spectrogram into 1 hour chunks
        splits = np.array_split(spectrogram, max(1, int(spectrogram.shape[1] * self.DT / 3600)), axis=1)
        for i, split in enumerate(splits):
            power_spec = np.abs(split[:, :chunk_size])
            log_power_spec = np.log(eps + power_spec)
            if pcen:
                square_power_spec = power_spec**2
            # Fix: the inner index used to shadow the outer loop variable `i`
            for j in range(1, 1 + split.shape[1] // chunk_size):
                power_spec = np.abs(split[:, j * chunk_size:(j + 1) * chunk_size])
                log_power_spec = np.hstack((log_power_spec, np.log(eps + power_spec)))
                if pcen:
                    square_power_spec = np.hstack((square_power_spec, power_spec**2))
            log_power_specs.append(log_power_spec)
            if pcen:
                square_power_specs.append(square_power_spec)
        log_power_spec = np.hstack(log_power_specs)
        output = log_power_spec.reshape((1,) + log_power_spec.shape)
        # Per-channel Energy Normalization. Parameters from Lostalen et al., default time constant.
        if pcen:
            # Fix: the original assigned librosa.pcen's result to the loop
            # variable, discarding it -- PCEN was never actually applied
            square_power_specs = [
                librosa.pcen(sps, sr=self.FREQ, hop_length=self.HOP_LENGTH, time_constant=0.4,
                             gain=0.8, power=0.25, bias=10)
                for sps in square_power_specs
            ]
            square_power_spec = np.hstack(square_power_specs)
            square_power_spec = square_power_spec.reshape((1,) + square_power_spec.shape)
            output = np.vstack((output, square_power_spec))
        return output
    def _split_power_spec(self, log_power_spec, normalize=''):
        """
        Splits a spectrogram 2D array along axis=1 given hop size and img width.
        The last image is padded to W_PIX by repeating its own trailing columns.
        """
        if normalize == 'global':
            maxi = log_power_spec.max()
            log_power_spec = log_power_spec - maxi
        # Split into overlapping fixed size images
        img_db = [log_power_spec[..., k * self.HOP_SPECTRO: k * self.HOP_SPECTRO + self.W_PIX] for k in range(max(1,
                  int(1 + np.ceil((log_power_spec.shape[-1] - self.W_PIX) / self.HOP_SPECTRO))))]
        if normalize == 'local':
            img_db = [subspec - subspec.max() for subspec in img_db]
        if img_db[-1].shape[-1] < self.W_PIX:
            while img_db[-1].shape[-1] < self.W_PIX:
                img = np.concatenate((img_db[-1], img_db[-1][..., -(self.W_PIX - img_db[-1].shape[-1]):]), axis=2)
                img_db[-1] = img
        return img_db
    def _merge_and_filter_labels(self, img_db):
        """
        Computes and return a dataframe containing img indexes and a list of bb coordinates for each images in a given file
        """
        # Img coordinates in original spectrogram
        img_coord = [(i * self.HOP_SPECTRO, i * self.HOP_SPECTRO + self.W_PIX - 1) for i in range(len(img_db))]
        img_coord = pd.DataFrame(img_coord).rename(columns={0: 'start', 1:'end'})
        # Merge filtered label dataset with each image in collection, keep only annotations that intersect the images
        labels_ = self.labels.loc[self.labels['filename'] == self.filename].copy()
        if len(labels_) == 0:
            labels_ = pd.DataFrame({key: [] for key in ['index', 'coord', 'bird_id', 'filename', 'birder']})
            return labels_
        # Convert second to pixels given DT, the time equivalent of hop_size
        for ex_label, new_label in zip(['t_start', 't_end'], ['x_1', 'x_2']):
            labels_[new_label] = (labels_[ex_label].astype(float) / self.DT).astype(int)
        # Same for frequencies
        for ex_label, new_label in zip(['f_start', 'f_end'], ['y_1', 'y_2']):
            labels_[new_label] = ((labels_[ex_label].clip(lower=self.LOW_FREQ, upper=self.HIGH_FREQ) - self.LOW_FREQ) / self.FREQ_ACCURACY).astype(int)
        labels_ = labels_.loc[labels_['y_1'] != labels_['y_2']]
        labels_.index = range(len(labels_))
        labels_['w'] = labels_['x_2'] - labels_['x_1'] + 1
        labels_['h'] = labels_['y_2'] - labels_['y_1'] + 1
        for size in ['w', 'h']:
            labels_ = labels_.loc[labels_[size] > 0]
        # Cross join every label with every image through a constant key
        labels_['joint'] = 1
        img_coord['joint'] = 1
        img_coord.reset_index(inplace=True)
        coord = ['x_1', 'y_1', 'x_2', 'y_2']
        labels_ = labels_[coord + ['w', 'h', 'joint', 'bird_id']].merge(img_coord, on='joint')
        labels_ = labels_.loc[(labels_['x_1'].between(labels_['start'], labels_['end'])) | (labels_['x_2'].between(labels_['start'], labels_['end']))]
        # Supress bbox with too small intersection with spectrogram
        labels_['inside'] = labels_[['x_2', 'end']].min(axis=1) - labels_[['x_1', 'start']].max(axis=1) + 1
        cond_1 = (labels_['inside'] < 0.5 * labels_['w']) & (labels_['inside'] < 15)
        cond_2 = (labels_['inside'] < 0.1 * labels_['w'])
        labels_ = labels_.loc[~(cond_1 | cond_2)]
        # Bounding boxes are expanded 10% in every direction
        labels_['x_1'] = (labels_['x_1'] - labels_['start'] - (labels_['w'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(lower=0)
        labels_['x_2'] = (labels_['x_2'] - labels_['start'] + (labels_['w'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(upper=self.W_PIX - 1)
        labels_['y_1'] = (labels_['y_1'] - (labels_['h'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(lower=0)
        labels_['y_2'] = (labels_['y_2'] + (labels_['h'] * 0.1).astype(int).clip(lower=3, upper=6)).clip(upper=self.H_PIX - 1)
        labels_['w'] = labels_['x_2'] - labels_['x_1']
        labels_['h'] = labels_['y_2'] - labels_['y_1']
        labels_['coord'] = [(x_1, y_1, x_2, y_2) for (x_1, y_1, x_2, y_2) in zip(labels_['x_1'], labels_['y_1'],
                                                                                 labels_['x_2'], labels_['y_2'])]
        # Delete negative samples if they appear in a positive image
        # NOTE(review): this default (inner) merge drops images whose boxes are
        # all noise before the `count == 0` test below can ever be true --
        # confirm whether a left merge with fillna(0) was intended
        labels_ = labels_.merge(labels_.loc[labels_['bird_id'] != -1].groupby('index').size().reset_index().rename(columns={0: 'count'}), on='index')
        labels_ = labels_.loc[(labels_['bird_id'] != -1) | (labels_['count'] == 0)]
        # One row per img
        labels_ = labels_.groupby('index', as_index=False).agg({'coord': lambda x: x.tolist(), 'bird_id': lambda x: x.tolist()})
        # Complete with negative samples
        labels_ = pd.merge(
            pd.DataFrame({'index': range(len(img_db))}, index=range(len(img_db))),
            labels_,
            how='outer',
            on='index'
        )
        if not self.is_test:
            # Sample negative images (10 % of total positive image count)
            positive_count = labels_['coord'].isnull().value_counts()
            if False in positive_count.index:
                n_positive = positive_count[False]
            else:
                n_positive = 0
            negative_idx = labels_.loc[labels_['coord'].isnull(), 'index'].values
            if len(negative_idx) > 0:
                # Fix: sample without replacement and cap by the number of
                # available negatives (the original sampled with replacement
                # and could draw duplicate indices)
                n_sample = min(int(n_positive * 0.1), 20, len(negative_idx))
                negative_idx = sorted(np.random.choice(negative_idx, n_sample, replace=False))
                # Fill negative images coord column with empty lists
                labels_.loc[negative_idx, 'coord'] = labels_.loc[negative_idx, 'coord'].map(lambda x: [(-1, -1, -1, -1)])
                # Fill negative images bird_id with [-1]
                labels_.loc[negative_idx, 'bird_id'] = labels_.loc[negative_idx, 'bird_id'].map(lambda x: [-1])
        labels_ = labels_.loc[labels_['coord'].notnull()].copy()
        # Replace noise samples coord with -1 list
        noise_idx = labels_['bird_id'].map(lambda x: -1 in x)
        labels_.loc[noise_idx, 'coord'] = labels_.loc[noise_idx, 'coord'].map(lambda x: [(-1, -1, -1, -1)])
        labels_['filename'] = self.filename
        # Fix: was `self.directory.split('\\')[-1]`, which only worked with
        # Windows-style separators; use the platform-aware basename instead
        labels_['birder'] = os.path.basename(os.path.normpath(self.directory))
        return labels_