# xlm-roberta-large-en-LM / Pre-Processing.py
# (Hugging Face upload metadata: rowan1224, "Upload 7 files", commit 3df661a)
#!/usr/bin/env python
# coding: utf-8
# # **Fine-tuning XLSR-Wav2Vec2 for Multi-Lingual ASR with 🤗 Transformers**
# In this notebook, I fine-tune XLSR-Wav2Vec2 on the French automatic speech recognition task, using speech data from the Common Voice Corpus 11.0. Because fine-tuning requires heavy computational resources, I used only 1,200 audio clips.
# In[49]:
# Retry loop originally used to work around flaky downloads of the gated
# Common Voice archive; kept for reference.
# import datasets
# while True:
# try:
# datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', split='test', use_auth_token=True)
# break
# except:
# print("Continue")
import datasets  # fix: `datasets` was only imported inside the commented-out block above

# Load the French subset of Common Voice 8.0.
# NOTE(review): the dataset is gated, so a logged-in HF token is required
# (`use_auth_token=True`); newer `datasets` versions rename this to `token`.
dataset = datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', use_auth_token=True)
# In[34]:
import pandas as pd
import shutil
import os
from tqdm import tqdm

# Copy every audio clip of the selected split(s) out of the HF cache into
# ./Dataset/<split>/, counting (but not aborting on) failed copies.
split = ['test']
count = 0  # number of clips that could not be copied
for s in split:
    tdir = f'./Dataset/{s}/'
    # One call creates both ./Dataset and ./Dataset/<split>; exist_ok
    # replaces the race-prone exists()/mkdir() pair of the original.
    os.makedirs(tdir, exist_ok=True)
    files = dataset[s]['path']
    for sfile in tqdm(files):
        # basename is more robust than split('/')[-1] across path separators
        filename = os.path.basename(sfile)
        dfile = os.path.join(tdir, filename)
        try:
            shutil.copy(sfile, dfile)
        except OSError:
            # best-effort: a missing/unreadable source just bumps the counter
            count += 1
print(count)
# ## Convert mp3 files to wav
# In[38]:
import pandas as pd
from pydub import AudioSegment
from joblib import Parallel, delayed
def convert(path, tdir):
    """Convert one mp3 file in *tdir* to wav in place, then delete the mp3.

    Parameters
    ----------
    path : str
        File name (not full path) of the mp3 inside *tdir*.
    tdir : str
        Directory containing the file.

    Best-effort: any failure (unreadable file, missing codec, ...) is
    swallowed and the function returns None, leaving the mp3 untouched.
    """
    try:
        src = os.path.join(tdir, path)
        dst = os.path.join(tdir, path.replace('.mp3', '.wav'))
        sound = AudioSegment.from_mp3(src)
        sound.export(dst, format="wav")
        # only delete the source after a successful export
        os.remove(src)
    except Exception:
        # fix: narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed while running under joblib
        return
# Fan the mp3->wav conversions out over 16 worker processes; `convert`
# is best-effort, so files that fail are simply left as mp3.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    Parallel(n_jobs=16)(
        delayed(convert)(name, tdir) for name in os.listdir(tdir)
    )
# In[89]:
import os

# Build a per-split CSV: keep only the dataset rows whose clip was actually
# copied (and converted to wav), and rewrite 'path' to the local wav file.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    # Files on disk are .wav; map back to the original .mp3 names once, and
    # use a set so each row's membership test is O(1) instead of a list scan.
    local_mp3s = {f.replace('.wav', '.mp3') for f in os.listdir(tdir)}
    data = [row for row in dataset[s] if os.path.basename(row['path']) in local_mp3s]
    df = pd.DataFrame(data)
    df['path'] = df['path'].apply(
        lambda x: tdir + os.path.basename(x).replace('.mp3', '.wav')
    )
    df.to_csv(f'./Dataset/{s}.csv', index=False)
# ## Clean Data
# In[90]:
# Select the split to clean and load its metadata CSV.
Set = 'test'
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(file)

# Blacklist of characters (regex metacharacters such as *, $ and ( are
# pre-escaped); the space-separated list becomes an alternation "a|b|c|...".
pattern = "& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å".replace(" ", "|")
pattern  # notebook cell echo; harmless no-op in a script
# In[92]:
import re

# Drop every row whose sentence contains a blacklisted character.
# Compile the pattern once and iterate the column directly — avoids a
# positional df.iloc lookup per row.
rx = re.compile(pattern)
# df was just read from CSV, so its index is a default RangeIndex and the
# positional indices collected here are valid labels for drop().
idx = [i for i, sentence in enumerate(df['sentence']) if rx.search(sentence)]
df = df.drop(labels=idx, axis=0)
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path, index=False)
print(f"Number of data Removed: {len(idx)}")
# ### Remove data items with null speech array
# In[99]:
import soundfile as sf

# Remove rows whose wav file decodes to zero samples (corrupt/empty clips).
path = f'./Dataset/{Set}'
empty_files = set()
for p in os.listdir(path):
    fpath = os.path.join(path, p)
    # sf.read returns (data, samplerate); empty data means a bad clip
    if sf.read(fpath)[0].shape[0] == 0:
        empty_files.add(fpath)
df = pd.read_csv(f'Dataset/{Set}-clean.csv')
# Set membership is O(1) per row (the original scanned a list), and
# enumerate over the column avoids a df.iloc call per row.
idx = [i for i, p in enumerate(df['path']) if p in empty_files]
df = df.drop(labels=idx, axis=0)
df.to_csv(f'Dataset/{Set}-clean.csv', index=False)
print(f"Number of data Removed: {len(idx)}")
# In[ ]: