File size: 1,702 Bytes
5a0849d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import os
import shutil
from subprocess import call

import pandas as pd
from tqdm import tqdm


# CSV that the sampling below draws from; the script filters it on a 'language'
# column and reads its 'audio' and 'text' columns.
train_path = '/home/mila/c/chris.emezue/naijavoices-research/abraham/data/train.csv'
# Directory that each value of the 'audio' column is joined onto to build the
# source path of a file.
AUDIO_DIR = '/home/mila/c/chris.emezue/scratch/naijavoices-data/naijavoices-dataset-compressed/audio-unconverted'
# Root directory for the copied samples; each file is placed in a subdirectory
# named after its language.
DESTINATION_AUDIO_PATH = '/home/mila/c/chris.emezue/naijavoices-research/sample_audios'
# Loaded eagerly at module level, so the CSV must exist before anything else runs.
df_train = pd.read_csv(train_path)

def get_sample_audios_per_language(df, language: str, n_sample: int = 30,
                                   audio_dir=None, random_state=None):
    """Randomly pick up to ``n_sample`` rows of one language from ``df``.

    Args:
        df: DataFrame with 'language', 'audio' and 'text' columns.
        language: Value matched exactly against the 'language' column.
        n_sample: Maximum number of rows to draw, without replacement. When
            fewer rows exist for the language, all of them are returned
            instead of raising.
        audio_dir: Directory joined onto each 'audio' value. Defaults to the
            module-level ``AUDIO_DIR`` when ``None``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a
            run can be reproduced. ``None`` keeps the draw non-deterministic.

    Returns:
        A list of ``(audio_path, transcript, language)`` tuples; empty when
        no row matches ``language`` or ``n_sample`` is 0.

    Raises:
        ValueError: If ``n_sample`` is negative (raised by pandas).
    """
    if audio_dir is None:
        audio_dir = AUDIO_DIR

    df_language = df[df['language'] == language]

    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap the request at what is available.
    n_rows = min(n_sample, len(df_language))
    if n_rows == 0:
        return []

    df_sampled = df_language.sample(n=n_rows, random_state=random_state)
    audio_files = df_sampled['audio'].values.tolist()
    transcripts = df_sampled['text'].values.tolist()
    return [
        (os.path.join(audio_dir, audio), transcript, language)
        for audio, transcript in zip(audio_files, transcripts)
    ]

# Draw a random sample for each language, using the function's default sample
# size.
ig_samples = get_sample_audios_per_language(df_train, 'igbo')
yo_samples = get_sample_audios_per_language(df_train, 'yoruba')
ha_samples = get_sample_audios_per_language(df_train, 'hausa')

# One flat list of (audio path, transcript, language) tuples, in the order
# igbo, yoruba, hausa.
all_samples = ig_samples + yo_samples + ha_samples

# NOTE: a leftover ``breakpoint()`` used to sit here; it dropped every run into
# the debugger and stalled non-interactive executions, so it was removed.

def copy_files(file, transcript, language, destination_dir=None):
    """Copy one audio file into the per-language sample directory.

    The file keeps its base name and is written to
    ``<destination_dir>/<language>/<basename>``.

    Args:
        file: Path of the source audio file.
        transcript: Transcript of the audio. Unused here; kept so callers can
            pass a whole sample tuple.
        language: Name of the subdirectory the file is copied into.
        destination_dir: Root directory for the copies. Defaults to the
            module-level ``DESTINATION_AUDIO_PATH`` when ``None``.

    Raises:
        OSError: If the source cannot be read or the destination cannot be
            written (for example ``FileNotFoundError`` for a missing source).
    """
    if destination_dir is None:
        destination_dir = DESTINATION_AUDIO_PATH

    audio_path_copy_to = os.path.join(destination_dir, language)
    # The language subdirectory may not exist yet on a fresh run; without it
    # the copy fails for every file.
    os.makedirs(audio_path_copy_to, exist_ok=True)

    copy_to_audio_path = os.path.join(audio_path_copy_to, os.path.basename(file))
    # In-process copy: no external `cp` per file, and a failure raises instead
    # of being lost in an ignored exit status.
    shutil.copy(file, copy_to_audio_path)


# Copy every sampled file; each sample is an (audio path, transcript, language)
# tuple whose fields are passed to copy_files positionally.
for sample in tqdm(all_samples, desc='Copying files...'):
    copy_files(*sample)


# Split the sample tuples into the three manifest columns. Only the base name
# of each audio path is recorded.
audio_names = [os.path.basename(path) for path, _transcript, _language in all_samples]
texts = [transcript for _path, transcript, _language in all_samples]
langs = [language for _path, _transcript, language in all_samples]

# Write the manifest into the current working directory, without the index
# column.
df = pd.DataFrame({'audio': audio_names, 'transcript': texts, 'language': langs})
df.to_csv('sample_audios.csv', index=False)