# xlm-roberta-large-en-LM / Pre-Processing.py
# (Hugging Face upload metadata: rowan1224, "Upload 7 files", commit 3df661a)
#!/usr/bin/env python
# coding: utf-8
# # **Fine-tuning XLSR-Wav2Vec2 for Multi-Lingual ASR with 🤗 Transformers**
# In this notebook, I fine-tune XLSR-Wav2Vec2 on the French automatic speech recognition task, using speech data from the Common Voice Corpus 11.0. Because fine-tuning requires heavy computational resources, I used only 1,200 audio clips.
# In[49]:
# Retry loop originally used to work around flaky downloads of the gated
# Common Voice archive; kept for reference.
# import datasets
# while True:
# try:
# datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', split='test', use_auth_token=True)
# break
# except:
# print("Continue")
import datasets  # fix: `datasets` was only imported inside the commented-out block above

# Load the French subset of Common Voice 8.0.
# NOTE(review): the dataset is gated, so a logged-in HF token is required
# (`use_auth_token=True`); newer `datasets` versions rename this to `token`.
dataset = datasets.load_dataset('mozilla-foundation/common_voice_8_0', 'fr', use_auth_token=True)
# In[34]:
import pandas as pd
import shutil
import os
from tqdm import tqdm

# Copy every audio clip of the selected split(s) out of the HF cache into
# ./Dataset/<split>/, counting (but not aborting on) failed copies.
split = ['test']
count = 0  # number of clips that could not be copied
for s in split:
    tdir = f'./Dataset/{s}/'
    # One call creates both ./Dataset and ./Dataset/<split>; exist_ok
    # replaces the race-prone exists()/mkdir() pair of the original.
    os.makedirs(tdir, exist_ok=True)
    files = dataset[s]['path']
    for sfile in tqdm(files):
        # basename is more robust than split('/')[-1] across path separators
        filename = os.path.basename(sfile)
        dfile = os.path.join(tdir, filename)
        try:
            shutil.copy(sfile, dfile)
        except OSError:
            # best-effort: a missing/unreadable source just bumps the counter
            count += 1
print(count)
# ## Convert mp3 files to wav
# In[38]:
import pandas as pd
from pydub import AudioSegment
from joblib import Parallel, delayed
def convert(path, tdir):
    """Convert one mp3 file in *tdir* to wav in place, then delete the mp3.

    Parameters
    ----------
    path : str
        File name (not full path) of the mp3 inside *tdir*.
    tdir : str
        Directory containing the file.

    Best-effort: any failure (unreadable file, missing codec, ...) is
    swallowed and the function returns None, leaving the mp3 untouched.
    """
    try:
        src = os.path.join(tdir, path)
        dst = os.path.join(tdir, path.replace('.mp3', '.wav'))
        sound = AudioSegment.from_mp3(src)
        sound.export(dst, format="wav")
        # only delete the source after a successful export
        os.remove(src)
    except Exception:
        # fix: narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed while running under joblib
        return
# Fan the mp3->wav conversions out over 16 worker processes; `convert`
# is best-effort, so files that fail are simply left as mp3.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    Parallel(n_jobs=16)(
        delayed(convert)(name, tdir) for name in os.listdir(tdir)
    )
# In[89]:
import os

# Build a per-split CSV: keep only the dataset rows whose clip was actually
# copied (and converted to wav), and rewrite 'path' to the local wav file.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    # Files on disk are .wav; map back to the original .mp3 names once, and
    # use a set so each row's membership test is O(1) instead of a list scan.
    local_mp3s = {f.replace('.wav', '.mp3') for f in os.listdir(tdir)}
    data = [row for row in dataset[s] if os.path.basename(row['path']) in local_mp3s]
    df = pd.DataFrame(data)
    df['path'] = df['path'].apply(
        lambda x: tdir + os.path.basename(x).replace('.mp3', '.wav')
    )
    df.to_csv(f'./Dataset/{s}.csv', index=False)
# ## Clean Data
# In[90]:
# Select the split to clean and load its metadata CSV.
Set = 'test'
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(file)

# Blacklist of characters (regex metacharacters such as *, $ and ( are
# pre-escaped); the space-separated list becomes an alternation "a|b|c|...".
pattern = "& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å".replace(" ", "|")
pattern  # notebook cell echo; harmless no-op in a script
# In[92]:
import re

# Drop every row whose sentence contains a blacklisted character.
# Compile the pattern once and iterate the column directly — avoids a
# positional df.iloc lookup per row.
rx = re.compile(pattern)
# df was just read from CSV, so its index is a default RangeIndex and the
# positional indices collected here are valid labels for drop().
idx = [i for i, sentence in enumerate(df['sentence']) if rx.search(sentence)]
df = df.drop(labels=idx, axis=0)
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path, index=False)
print(f"Number of data Removed: {len(idx)}")
# ### Remove data items with null speech array
# In[99]:
import soundfile as sf

# Remove rows whose wav file decodes to zero samples (corrupt/empty clips).
path = f'./Dataset/{Set}'
empty_files = set()
for p in os.listdir(path):
    fpath = os.path.join(path, p)
    # sf.read returns (data, samplerate); empty data means a bad clip
    if sf.read(fpath)[0].shape[0] == 0:
        empty_files.add(fpath)
df = pd.read_csv(f'Dataset/{Set}-clean.csv')
# Set membership is O(1) per row (the original scanned a list), and
# enumerate over the column avoids a df.iloc call per row.
idx = [i for i, p in enumerate(df['path']) if p in empty_files]
df = df.drop(labels=idx, axis=0)
df.to_csv(f'Dataset/{Set}-clean.csv', index=False)
print(f"Number of data Removed: {len(idx)}")
# In[ ]: