Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import json | |
| import os | |
| from subprocess import call | |
| from tqdm import tqdm | |
# Directory that the relative ``audio`` entries of the training CSV are joined onto.
AUDIO_DIR = '/home/mila/c/chris.emezue/scratch/naijavoices-data/naijavoices-dataset-compressed/audio-unconverted'

# Root folder receiving the copied samples (one sub-folder per language).
DESTINATION_AUDIO_PATH = '/home/mila/c/chris.emezue/naijavoices-research/sample_audios'

# Training metadata; the script reads its ``language``, ``audio`` and ``text`` columns.
train_path = '/home/mila/c/chris.emezue/naijavoices-research/abraham/data/train.csv'
df_train = pd.read_csv(filepath_or_buffer=train_path)
def get_sample_audios_per_language(df, language: str, n_sample: int = 30,
                                   audio_dir=None, random_state=None):
    """Draw a random sample of audio clips for a single language.

    Args:
        df: DataFrame with at least the columns ``language``, ``audio``
            and ``text``.
        language: Value compared for equality against the ``language`` column.
        n_sample: Maximum number of rows to draw. When fewer rows are
            available for ``language``, all of them are returned instead of
            raising.
        audio_dir: Directory joined in front of every ``audio`` value.
            ``None`` (the default) uses the module-level ``AUDIO_DIR``.
        random_state: Forwarded to ``DataFrame.sample``; pass a seed to get
            a reproducible selection. ``None`` keeps the sampling random.

    Returns:
        A list of ``(audio_path, transcript, language)`` tuples; empty when
        ``df`` has no row for ``language``.
    """
    base_dir = AUDIO_DIR if audio_dir is None else audio_dir

    df_language = df[df['language'] == language]
    if df_language.empty:
        return []

    # Sampling without replacement raises ValueError when n exceeds the
    # number of available rows, so cap the request at the population size.
    n_rows = min(n_sample, len(df_language))
    df_sampled = df_language.sample(n=n_rows, random_state=random_state)

    audio_files = df_sampled['audio'].tolist()
    transcripts = df_sampled['text'].tolist()
    return [
        (os.path.join(base_dir, audio), text, language)
        for audio, text in zip(audio_files, transcripts)
    ]
# Draw the default-sized random sample for each of the three languages.
ig_samples = get_sample_audios_per_language(df_train, 'igbo')
yo_samples = get_sample_audios_per_language(df_train, 'yoruba')
ha_samples = get_sample_audios_per_language(df_train, 'hausa')

# One flat list of (audio_path, transcript, language) tuples, ordered
# Igbo -> Yoruba -> Hausa.
all_samples = ig_samples + yo_samples + ha_samples
def copy_files(file, transcript, language, destination_root=None):
    """Copy one audio file into the destination folder of its language.

    Args:
        file: Path of the audio file to copy.
        transcript: Transcript of the clip. Unused here; kept so the
            signature matches the ``(path, transcript, language)`` tuples
            passed by callers.
        language: Name of the sub-folder (under the destination root) that
            receives the copy.
        destination_root: Root folder for the copies. ``None`` (the
            default) uses the module-level ``DESTINATION_AUDIO_PATH``.

    Returns:
        The path of the copied file.

    Raises:
        OSError: If ``file`` cannot be read or the destination cannot be
            written (for example ``FileNotFoundError`` for a missing source).
    """
    # Standard-library module, imported locally to keep the function
    # self-contained.
    import shutil

    root = DESTINATION_AUDIO_PATH if destination_root is None else destination_root
    language_dir = os.path.join(root, language)

    # The per-language folder may not exist yet; create it on demand so the
    # copy does not fail on a fresh destination.
    os.makedirs(language_dir, exist_ok=True)

    target_path = os.path.join(language_dir, os.path.basename(file))
    # shutil.copy raises on failure, unlike the unchecked exit status of an
    # external `cp` process.
    shutil.copy(file, target_path)
    return target_path
# Copy every sampled clip into its per-language destination folder.
for source_path, transcript, lang in tqdm(all_samples, desc='Copying files...'):
    copy_files(source_path, transcript, lang)

# Build the manifest columns: file name only (no directory), transcript, language.
audio_names = [os.path.basename(path) for path, _, _ in all_samples]
texts = [text for _, text, _ in all_samples]
langs = [lang_name for _, _, lang_name in all_samples]

# Write the manifest next to the current working directory, without the index.
manifest_columns = {
    'audio': audio_names,
    'transcript': texts,
    'language': langs,
}
df = pd.DataFrame(manifest_columns)
df.to_csv('sample_audios.csv', index=False)