File size: 3,586 Bytes
3df661a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python
# coding: utf-8

# # **Fine-tuning XLSR-Wav2Vec2 for Multi-Lingual ASR with 🤗 Transformers**

# In this notebook, I fine-tune XLSR-Wav2Vec2 on the French automatic speech recognition task, using speech data from the Common Voice corpus (the code below loads Common Voice 8.0). Because this fine-tuning requires heavy computational power, I used 1200 audio clips for this task.

# In[49]:


# The download occasionally fails with transient network errors; this retry
# loop can be re-enabled if that happens.
#
# while True:
#     try:
#         datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', split='test', use_auth_token=True)
#         break
#     except Exception:
#         print("Continue")

# BUG FIX: `datasets` was referenced below but its import was commented out,
# causing a NameError when the script (rather than the notebook) runs.
import datasets

# Load the full French subset of Common Voice 8.0. Requires a Hugging Face
# auth token with access to the gated dataset.
dataset = datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', use_auth_token=True)


# In[34]:


import pandas as pd
import shutil
import os
from tqdm import tqdm


# Copy the audio clips of each requested split out of the Hugging Face cache
# into a local ./Dataset/<split>/ directory.
split = ['test']
count = 0  # number of clips that could not be copied
for s in split:

    tdir = f'./Dataset/{s}/'
    # makedirs creates ./Dataset and ./Dataset/<split> in one call and is a
    # no-op when the directories already exist (replaces the racy
    # exists()+mkdir() pair of the original).
    os.makedirs(tdir, exist_ok=True)

    files = dataset[s]['path']

    for sfile in tqdm(files):
        # basename is portable, unlike splitting on '/'.
        filename = os.path.basename(sfile)
        dfile = os.path.join(tdir, filename)
        try:
            shutil.copy(sfile, dfile)
        except OSError:
            # Best effort: some cache entries may be missing or unreadable;
            # just count the failures and report them at the end.
            count += 1

print(count)


# ## Convert mp3 files to wav

# In[38]:


import pandas as pd
from pydub import AudioSegment
from joblib import Parallel, delayed

def convert(path, tdir):
    """Convert one mp3 file inside *tdir* to wav and delete the mp3.

    Failures are swallowed deliberately so that a single corrupt or
    undecodable clip does not abort the whole parallel batch.
    """
    try:
        src = os.path.join(tdir, path)
        dst = os.path.join(tdir, path.replace('.mp3', '.wav'))
        # pydub shells out to ffmpeg for the mp3 -> wav conversion.
        sound = AudioSegment.from_mp3(src)
        sound.export(dst, format="wav")

        # Remove the source only after a successful export.
        os.remove(src)
    except Exception:
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; `Exception` keeps the best-effort
        # behaviour without hiding those.
        return
    

# Run the mp3 -> wav conversion for every file of each split, fanning the
# work out over 16 worker processes with joblib.
split = ['test']

for subset in split:

    target_dir = f'./Dataset/{subset}/'
    pending = os.listdir(target_dir)

    Parallel(n_jobs=16)(delayed(convert)(name, target_dir) for name in pending)
    


# In[89]:


import os

# Rebuild a metadata CSV per split containing only the rows whose audio clip
# was successfully copied and converted, with `path` rewritten to point at
# the local .wav file.
split = ['test']
for s in split:

    tdir = f'./Dataset/{s}/'
    # Map the converted .wav names back to the original .mp3 names, and use a
    # set so the per-row membership test below is O(1) instead of O(m)
    # (the dataset split can have hundreds of thousands of rows).
    present = {f.replace('.wav', '.mp3') for f in os.listdir(tdir)}
    data = [row for row in dataset[s] if os.path.basename(row['path']) in present]

    df = pd.DataFrame(data)
    df['path'] = df['path'].apply(
        lambda x: tdir + os.path.basename(x).replace('.mp3', '.wav'))
    df.to_csv(f'./Dataset/{s}.csv', index=False)


# ## Clean Data

# In[90]:


# Select the split to clean and reload its metadata CSV from disk, so the
# cleaning steps below operate on exactly what was written in the previous
# step.
Set = 'test'
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(file)


# ### Remove data with unwanted characters

# In[91]:


# Space-separated list of characters that disqualify a sentence (a few are
# regex-escaped, e.g. \* \$ \( ); joined with '|' into one alternation regex.
unwanted_chars = "& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å"
pattern = "|".join(unwanted_chars.split(" "))
pattern  # notebook cell: display the resulting regex


# In[92]:


import re

# Drop every row whose sentence contains one of the unwanted characters.
# The vectorised str.contains replaces the original row-by-row re.search
# loop; na=False additionally guards against NaN sentences, which would
# have crashed re.search with a TypeError.
mask = df['sentence'].str.contains(pattern, regex=True, na=False)
# df was freshly read with read_csv, so its RangeIndex labels equal the
# positional indices the original loop produced.
idx = df.index[mask].tolist()
df = df.drop(labels=idx, axis=0)
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path, index=False)
print(f"Number of data Removed: {len(idx)}")


# ### Remove data items with null speech array

# In[99]:


import soundfile as sf

# Find clips whose decoded sample array is empty (zero frames); such files
# break feature extraction downstream.
path = f'./Dataset/{Set}'
files = set()  # set: O(1) membership test in the isin() filter below
for p in os.listdir(path):
    full = os.path.join(path, p)  # joined once, reused for read and lookup
    samples, _ = sf.read(full)
    if samples.shape[0] == 0:
        files.add(full)

df = pd.read_csv(f'Dataset/{Set}-clean.csv')
# NOTE(review): the CSV's `path` column was written as
# './Dataset/<split>/<name>.wav', which matches os.path.join(path, p) on
# POSIX separators — confirm if this ever runs on Windows.
idx = df.index[df['path'].isin(files)].tolist()
df = df.drop(labels=idx, axis=0)
df.to_csv(f'Dataset/{Set}-clean.csv', index=False)
print(f"Number of data Removed: {len(idx)}")


# In[ ]: