#!/usr/bin/env python
# coding: utf-8

# # **Fine-tuning XLSR-Wav2Vec2 for Multi-Lingual ASR with 🤗 Transformers**

# In this notebook, I attempt to fine-tune XLSR Wav2Vec 2.0 on a French
# automatic speech recognition task. I have used the speech data from Common
# Voice Corpus 11.0. Because this fine-tuning requires heavy computational
# power, I used 1200 audio clips for this task.
#
# NOTE(review): the text above mentions Common Voice 11.0, but the code below
# loads 'mozilla-foundation/common_voice_8_0' -- confirm which one is intended.

# In[49]:

import os
import re
import shutil

import datasets
import pandas as pd
import soundfile as sf
from joblib import Parallel, delayed
from pydub import AudioSegment
from tqdm import tqdm

# `import datasets` used to live only inside a commented-out retry loop, so a
# fresh run of this script failed with NameError on the next line.
dataset = datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', use_auth_token=True)


# In[34]:

# Copy the audio files referenced by the dataset into ./Dataset/<split>/.
split = ['test']
count = 0  # number of files that could not be copied
for s in split:
    tdir = f'./Dataset/{s}/'
    # Creates ./Dataset as well when it is missing; no error if already there.
    os.makedirs(tdir, exist_ok=True)
    files = dataset[s]['path']
    for sfile in tqdm(files):
        filename = sfile.split('/')[-1]
        dfile = os.path.join(tdir, filename)
        try:
            shutil.copy(sfile, dfile)
        except OSError:
            # Best effort: keep going and report the failure count at the end.
            count += 1
print(count)


# ## Convert mp3 files to wav

# In[38]:

def convert(path, tdir):
    """Convert one mp3 file inside *tdir* to wav and delete the mp3.

    Args:
        path: File name (not a full path) of the mp3 inside ``tdir``.
        tdir: Directory holding the file; the wav is written next to it.

    Returns:
        True when the wav was written and the mp3 removed, False when the
        file was skipped (not an mp3) or the conversion failed.
    """
    # Only touch mp3 files. Without this guard a re-run would map an existing
    # "x.wav" onto itself and could end up deleting the converted file.
    if not path.endswith('.mp3'):
        return False
    s = os.path.join(tdir, path)
    d = os.path.join(tdir, os.path.splitext(path)[0] + '.wav')
    try:
        # convert mp3 to wav
        sound = AudioSegment.from_mp3(s)
        sound.export(d, format="wav")
        os.remove(s)
    except Exception:
        # Deliberately best effort (one bad clip must not abort the batch),
        # but no longer a bare `except:` that also traps KeyboardInterrupt.
        return False
    return True


split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    files = os.listdir(tdir)
    results = Parallel(n_jobs=16)(delayed(convert)(path, tdir) for path in files)
    # Surface how many clips were left unconverted instead of hiding it.
    not_converted = sum(1 for ok in results if not ok)
    print(f"Files not converted in '{s}': {not_converted}")


# In[89]:

# Build ./Dataset/<split>.csv with one row per successfully converted clip.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    # Only clips that actually have a wav on disk count as converted; leftover
    # mp3 files (failed conversions) would otherwise yield dangling wav paths.
    # A set keeps the per-row membership test O(1).
    files = {
        os.path.splitext(name)[0] + '.mp3'
        for name in os.listdir(tdir)
        if name.endswith('.wav')
    }
    data = [row for row in dataset[s] if row['path'].split('/')[-1] in files]
    df = pd.DataFrame(data)
    # Point each row at the local wav copy instead of the original mp3.
    df['path'] = df['path'].apply(
        lambda x: tdir + os.path.splitext(x.split('/')[-1])[0] + '.wav'
    )
    df.to_csv(f'./Dataset/{s}.csv', index=False)


# ## Clean Data

# In[90]:

Set = 'test'
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(file)


# ### Remove data with unwanted characters

# In[91]:

# Space-separated characters that disqualify a sentence. Written as a raw
# string so the regex escapes (\*, \$, \() are not invalid str escapes.
pattern = r"& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å"
# Turn the list into a regex alternation: "a|b|c|...".
pattern = pattern.replace(" ", "|")


# In[92]:

# Vectorised search; na=False keeps rows whose sentence is missing instead of
# crashing in re.search on a float NaN.
unwanted = df['sentence'].str.contains(re.compile(pattern), na=False)
# Collect index *labels* so the drop below cannot mix positions and labels.
idx = df.index[unwanted].tolist()
df = df.drop(labels=idx, axis=0)
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path, index=False)
print(f"Number of data Removed: {len(idx)}")


# ### Remove data items with null speech array

# In[99]:

path = f'./Dataset/{Set}'
# Full paths of wav files whose decoded sample array is empty. Non-wav
# leftovers are skipped so sf.read is only asked to open converted clips.
files = [
    os.path.join(path, p)
    for p in os.listdir(path)
    if p.endswith('.wav') and sf.read(os.path.join(path, p))[0].shape[0] == 0
]
df = pd.read_csv(f'Dataset/{Set}-clean.csv')
# Match on the file name only: the CSV stores './Dataset/<split>/name.wav',
# which differs from os.path.join output on platforms with another separator.
empty_names = {os.path.basename(p) for p in files}
is_empty = df['path'].map(os.path.basename).isin(empty_names)
idx = df.index[is_empty].tolist()
df = df.drop(labels=idx, axis=0)
df.to_csv(f'Dataset/{Set}-clean.csv', index=False)
print(f"Number of data Removed: {len(idx)}")


# In[ ]: