"""Copy a random sample of audio clips per language and write a manifest.

Reads the training CSV, draws up to ``n_sample`` rows for each language in
``LANGUAGES``, copies the referenced audio files into
``DESTINATION_AUDIO_PATH/<language>/`` and writes ``sample_audios.csv``
(columns ``audio``, ``transcript``, ``language``) to the current working
directory.
"""
import json  # noqa: F401  (unused here; kept so existing users of the name keep working)
import os
import shutil
import sys
from subprocess import call  # noqa: F401  (unused here; copying now goes through shutil)
from typing import List, Optional, Tuple

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Stand-in used when tqdm is not installed: iterate without a progress bar."""
        return iterable


train_path = '/home/mila/c/chris.emezue/naijavoices-research/abraham/data/train.csv'
AUDIO_DIR = '/home/mila/c/chris.emezue/scratch/naijavoices-data/naijavoices-dataset-compressed/audio-unconverted'
DESTINATION_AUDIO_PATH = '/home/mila/c/chris.emezue/naijavoices-research/sample_audios'

# Languages to sample, in the order their rows appear in the manifest.
LANGUAGES = ('igbo', 'yoruba', 'hausa')


def get_sample_audios_per_language(
    df: pd.DataFrame,
    language: str,
    n_sample: int = 30,
    audio_dir: str = AUDIO_DIR,
    random_state: Optional[int] = None,
) -> List[Tuple[str, str, str]]:
    """Randomly pick audio clips of one language from ``df``.

    Args:
        df: Frame with at least the columns ``language``, ``audio`` and
            ``text``.
        language: Value of the ``language`` column to select.
        n_sample: Maximum number of rows to draw. When fewer rows exist for
            the language, all of them are returned instead of raising.
        audio_dir: Directory that the ``audio`` column values are joined onto.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible draw.

    Returns:
        A list of ``(audio_path, transcript, language)`` tuples; empty when
        the language does not occur in ``df``.
    """
    df_language = df[df['language'] == language]
    if df_language.empty:
        return []

    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so cap the request at what is available.
    n_take = min(n_sample, len(df_language))
    df_sampled = df_language.sample(n=n_take, random_state=random_state)

    return [
        (os.path.join(audio_dir, audio), text, language)
        for audio, text in zip(df_sampled['audio'].tolist(), df_sampled['text'].tolist())
    ]


def copy_files(file, transcript, language, destination_root: str = DESTINATION_AUDIO_PATH):
    """Copy one audio file into the folder of its language.

    Args:
        file: Path of the audio file to copy.
        transcript: Transcript of the clip. Not used by the copy itself; kept
            so the signature matches the sample tuples.
        language: Name of the sub-directory under ``destination_root``.
        destination_root: Directory that holds the per-language folders.

    Returns:
        The path the file was copied to.

    Raises:
        OSError: If the source cannot be read or the destination cannot be
            written.
    """
    target_dir = os.path.join(destination_root, language)
    # The per-language folder may not exist yet; create it before copying.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(file))
    shutil.copy(file, target_path)
    return target_path


def main() -> None:
    """Sample clips for every language, copy them and write the manifest."""
    df_train = pd.read_csv(train_path)

    all_samples = []
    for language in LANGUAGES:
        all_samples.extend(get_sample_audios_per_language(df_train, language))

    copied_rows = []
    for audio_path, transcript, language in tqdm(all_samples, desc='Copying files...'):
        try:
            copy_files(audio_path, transcript, language)
        except OSError as err:
            # Keep going on a bad file, but leave it out of the manifest so
            # the CSV only lists clips that were actually copied.
            print(f'Skipping {audio_path}: {err}', file=sys.stderr)
            continue
        copied_rows.append((os.path.basename(audio_path), transcript, language))

    manifest = pd.DataFrame(copied_rows, columns=['audio', 'transcript', 'language'])
    manifest.to_csv('sample_audios.csv', index=False)


if __name__ == '__main__':
    main()