#!/usr/bin/env python
# coding: utf-8

# # **Fine-tuning XLSR-Wav2Vec2 for Multi-Lingual ASR with 🤗 Transformers**

# In this notebook, I fine-tune XLSR-Wav2Vec2 for French Automatic Speech Recognition, using speech data from Common Voice Corpus 11.0. Because fine-tuning is computationally expensive, I used only 1200 audio clips for this task.

# In[49]:


# The import must be live: when this file runs as a script (rather than in a
# notebook kernel that already imported it) `datasets` is otherwise undefined.
import datasets

# Load the French configuration of Common Voice 8.0 (all available splits).
# `use_auth_token=True` makes the library use the locally stored Hugging Face
# credentials for the download.
dataset = datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', use_auth_token=True)


# In[34]:


import pandas as pd
import shutil
import os
from tqdm import tqdm


# Copy the audio clips of each selected split out of the dataset's storage
# location into ./Dataset/<split>/ so they can be converted in place later.
split = ['test']
count = 0  # number of clips that could not be copied
for s in split:

    tdir = f'./Dataset/{s}/'
    # makedirs creates './Dataset' and the split folder in a single call and
    # does nothing when the directories already exist.
    os.makedirs(tdir, exist_ok=True)

    files = dataset[s]['path']

    for sfile in tqdm(files):
        # Keep only the file name; the clip lands directly inside tdir.
        filename = sfile.split('/')[-1]
        dfile = os.path.join(tdir, filename)
        try:
            shutil.copy(sfile, dfile)
        except Exception:
            # Best-effort copy: a clip that cannot be copied is counted
            # rather than aborting the whole run.
            count += 1

# Report how many clips failed to copy.
print(count)


# ## Convert mp3 files to wav

# In[38]:


import pandas as pd
from pydub import AudioSegment
from joblib import Parallel, delayed

def convert(path,tdir):
    """Convert one MP3 file inside *tdir* to WAV, then delete the MP3.

    Args:
        path: File name relative to ``tdir``.
        tdir: Directory containing the file; the WAV is written next to it.

    Returns:
        None. The conversion is best-effort: a file that cannot be decoded,
        written or removed is reported on stdout and left untouched.
    """
    # Only real ``.mp3`` names are converted. Without this guard a ``.wav``
    # (e.g. on a re-run) would map source and destination to the same path
    # and could end up deleted by the cleanup step below.
    if not path.endswith('.mp3'):
        return

    s = os.path.join(tdir, path)
    # Swap just the extension; a ``.mp3`` elsewhere in the name is preserved.
    d = os.path.join(tdir, path[:-len('.mp3')] + '.wav')

    try:
        # convert mp3 to wav
        sound = AudioSegment.from_mp3(s)
        sound.export(d, format="wav")

        # Remove the source only after the WAV was written successfully.
        os.remove(s)
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts,
        # and say which file was skipped instead of failing silently.
        print(f"Could not convert {s}: {err}")
        return
    
    
# Convert every file found in each split directory, fanning the work out
# over 16 joblib workers.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    files = os.listdir(tdir)

    # One lazy task per directory entry; Parallel consumes the generator.
    jobs = (delayed(convert)(path, tdir) for path in files)
    Parallel(n_jobs=16)(jobs)
    

# In[89]:


import os
# Build one metadata CSV per split, keeping only the dataset rows whose
# audio file is actually present in ./Dataset/<split>/.
split = ['test']
for s in split:

    tdir = f'./Dataset/{s}/'
    # Map the names on disk back to their original .mp3 names. A set gives
    # constant-time membership tests for the per-row check below.
    files = {name.replace('.wav', '.mp3') for name in os.listdir(tdir)}
    data = [row for row in dataset[s] if row['path'].split('/')[-1] in files]

    df = pd.DataFrame(data)
    # Point each row at the local .wav copy instead of the original path.
    df['path'] = df['path'].apply(lambda x: tdir+x.split('/')[-1].replace('.mp3','.wav'))
    df.to_csv(f'./Dataset/{s}.csv',index=False)


# ## Clean Data

# In[90]:


# Pick the split to clean and read its metadata table back from disk.
Set = 'test'
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(filepath_or_buffer=file)


# ### Remove data with unwanted characters

# In[91]:


# Space-separated list of characters that disqualify a sentence. It is a raw
# string so that the regex escapes (\*, \$, \() are not treated as invalid
# Python string escapes.
pattern = r"& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å"
# Turn the list into a regex alternation: "a|b|c|...".
pattern = pattern.replace(" ","|")
pattern


# In[92]:


import re
# Flag every row whose sentence contains one of the unwanted characters.
# str.contains runs the regex search over the whole column at once, and
# na=False treats a missing sentence as "no match" instead of raising.
mask = df['sentence'].str.contains(pattern, regex=True, na=False)
# Drop by index label so the result is right even if the index is not 0..n-1.
idx = df.index[mask].tolist()
df = df.drop(labels=idx, axis=0)
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path,index=False)
print(f"Number of data Removed: {len(idx)}")


# ### Remove data items with null speech array

# In[99]:


import soundfile as sf

# Find audio files that decode to zero samples and remove their rows from
# the cleaned CSV.
path = f'./Dataset/{Set}'
files = set()
for p in os.listdir(path):
    # Join with '/' so the string is identical to the CSV's 'path' values,
    # which were written as './Dataset/<split>/' + file name.
    full_path = f'{path}/{p}'
    # sf.read returns (data, samplerate); an empty first axis means no audio.
    if sf.read(full_path)[0].shape[0] == 0:
        files.add(full_path)

df = pd.read_csv(f'Dataset/{Set}-clean.csv')
# Vectorised membership test against the set of empty files.
mask = df['path'].isin(files)
idx = df.index[mask].tolist()
df = df.drop(labels=idx, axis=0)
df.to_csv(f'Dataset/{Set}-clean.csv',index=False)
print(f"Number of data Removed: {len(idx)}")


# In[ ]: