File size: 3,586 Bytes
3df661a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
#!/usr/bin/env python
# coding: utf-8
# # **Fine-tuning XLSR-Wav2Vec2 for Multi-Lingual ASR with 🤗 Transformers**
# In this notebook, I fine-tune XLSR-Wav2Vec2 on the French automatic speech recognition task, using speech data from the Common Voice Corpus 11.0. Because this fine-tuning requires heavy computational power, I used 1,200 audio clips for the task.
# In[49]:
import datasets

# Download the French Common Voice 8.0 corpus (all splits). Requires a
# logged-in Hugging Face account; `use_auth_token=True` reads the cached token.
# NOTE(review): the markdown above mentions Common Voice 11.0 while the code
# loads 8.0 — confirm which corpus version is intended.
dataset = datasets.load_dataset(
    'mozilla-foundation/common_voice_8_0', 'fr', use_auth_token=True
)
# In[34]:
import pandas as pd
import shutil
import os
from tqdm import tqdm

# Copy every audio clip referenced by each dataset split into ./Dataset/<split>/.
# Clips whose source file cannot be copied are counted and skipped (best-effort).
split = ['test']
count = 0  # number of clips that failed to copy
for s in split:
    tdir = f'./Dataset/{s}/'
    # makedirs replaces the racy exists()/mkdir dance and creates parents too.
    os.makedirs(tdir, exist_ok=True)
    files = dataset[s]['path']
    for sfile in tqdm(files):
        # basename is portable; splitting on '/' breaks on Windows paths.
        dfile = os.path.join(tdir, os.path.basename(sfile))
        try:
            shutil.copy(sfile, dfile)
        except OSError:
            # Missing/unreadable source file — skip it and keep going.
            count += 1
print(count)
# ## Convert mp3 files to wav
# In[38]:
import pandas as pd
from pydub import AudioSegment
from joblib import Parallel, delayed
def convert(path, tdir):
    """Convert one mp3 file inside `tdir` to wav in place, then delete the mp3.

    Best-effort: any failure (corrupt file, non-mp3 input, missing ffmpeg)
    is silently skipped so the parallel batch keeps running.
    """
    try:
        s = os.path.join(tdir, path)
        d = os.path.join(tdir, path.replace('.mp3', '.wav'))
        # convert mp3 to wav
        sound = AudioSegment.from_mp3(s)
        sound.export(d, format="wav")
        os.remove(s)
    except Exception:
        # `except Exception` (not bare `except:`) so KeyboardInterrupt and
        # SystemExit still propagate and can stop the joblib workers.
        return
# Fan the mp3-to-wav conversion out over 16 worker processes, one batch
# per split directory.
split = ['test']
for subset in split:
    subset_dir = f'./Dataset/{subset}/'
    mp3_names = os.listdir(subset_dir)
    Parallel(n_jobs=16)(
        delayed(convert)(name, subset_dir) for name in mp3_names
    )
# In[89]:
import os

# For each split, build a CSV that keeps only the dataset rows whose clip
# survived the copy/convert steps, with `path` rewritten to the local wav file.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    # The directory now holds wav files; map their names back to the original
    # mp3 names. A set gives O(1) membership tests instead of O(m) per row.
    surviving = {f.replace('.wav', '.mp3') for f in os.listdir(tdir)}
    data = [row for row in dataset[s] if os.path.basename(row['path']) in surviving]
    df = pd.DataFrame(data)
    df['path'] = df['path'].apply(
        lambda p: tdir + os.path.basename(p).replace('.mp3', '.wav')
    )
    df.to_csv(f'./Dataset/{s}.csv', index=False)
# ## Clean Data
# In[90]:
# From here on we clean a single split; start from the CSV written above.
Set = 'test'
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(file)
# ### Remove data with unwanted characters
# In[91]:
# Space-separated list of characters whose presence disqualifies a sentence.
# Regex metacharacters (* $ () are pre-escaped so the list can be turned into
# an alternation pattern directly.
pattern = "& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å"
# Join on '|' to form a single "any of these" regex alternation.
pattern = "|".join(pattern.split(" "))
pattern
# In[92]:
import re

# Drop every row whose sentence matches the unwanted-character pattern.
# A vectorized str.contains scan replaces the per-row df.iloc lookup loop;
# fillna(False) keeps rows with a missing sentence instead of crashing.
bad_mask = df['sentence'].str.contains(pattern, regex=True).fillna(False)
n_removed = int(bad_mask.sum())
df = df.loc[~bad_mask]
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path, index=False)
print(f"Number of data Removed: {n_removed}")
# ### Remove data items with null speech array
# In[99]:
import soundfile as sf

# Remove rows whose wav file contains zero audio frames. sf.info reads only
# the file header, so this is far cheaper than decoding every file with
# sf.read just to check the array length.
path = f'./Dataset/{Set}'
empty_files = set()  # set gives O(1) membership tests below
for name in os.listdir(path):
    fpath = os.path.join(path, name)
    try:
        if sf.info(fpath).frames == 0:
            empty_files.add(fpath)
    except RuntimeError:
        # Unreadable/corrupt header — treat as empty so its row is removed too.
        empty_files.add(fpath)
df = pd.read_csv(f'Dataset/{Set}-clean.csv')
empty_mask = df['path'].isin(empty_files)
n_removed = int(empty_mask.sum())
df = df.loc[~empty_mask]
df.to_csv(f'Dataset/{Set}-clean.csv', index=False)
print(f"Number of data Removed: {n_removed}")
# In[ ]:
|