|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the French ('fr') configuration of Common Voice 8.0 through the
# `datasets` library. `use_auth_token=True` makes the call authenticate with
# the locally stored Hugging Face token.
dataset = datasets.load_dataset(
    'mozilla-foundation/common_voice_8_0',
    'fr',
    use_auth_token=True,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import shutil |
|
|
import os |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
# Copy every audio file referenced by the selected split(s) into
# ./Dataset/<split>/ so the following steps can work on local copies.
split = ['test']
count = 0  # number of files that could not be copied
for s in split:
    tdir = f'./Dataset/{s}/'
    # Creates ./Dataset too when it is missing and is a no-op when the
    # folder already exists (no separate existence checks needed).
    os.makedirs(tdir, exist_ok=True)

    # Source paths come from the dataset's 'path' column.
    files = dataset[s]['path']

    for sfile in tqdm(files):
        filename = os.path.basename(sfile)
        dfile = os.path.join(tdir, filename)
        try:
            shutil.copy(sfile, dfile)
        except OSError:
            # Best effort: a missing/unreadable source must not abort the
            # whole copy; failures are only counted and reported below.
            count += 1

# Total number of failed copies across all splits.
print(count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
from pydub import AudioSegment |
|
|
from joblib import Parallel, delayed |
|
|
|
|
|
def convert(path,tdir):
    """Convert one MP3 file located in ``tdir`` to WAV, then delete the MP3.

    Args:
        path: File name (relative to ``tdir``) of the file to convert.
        tdir: Directory that contains the file; the WAV is written next to it.

    Returns:
        None. The function is best effort: files that are not ``.mp3`` are
        skipped, and any conversion error is reported and swallowed so one
        bad file does not stop a batch run.
    """
    # Only touch real MP3 files. Without this guard a non-MP3 name would map
    # to itself and a successful export would be deleted again right after.
    if not path.endswith('.mp3'):
        return

    s = os.path.join(tdir, path)
    # Swap only the extension; str.replace would also rewrite '.mp3'
    # occurring in the middle of the file name.
    d = os.path.join(tdir, path[:-len('.mp3')] + '.wav')

    try:
        sound = AudioSegment.from_mp3(s)
        sound.export(d, format="wav")

        # The source is removed only after the export succeeded.
        os.remove(s)
    except Exception as exc:
        # Deliberately broad (decoder errors vary), but unlike a bare
        # `except:` it lets KeyboardInterrupt/SystemExit through.
        print(f'Could not convert {s}: {exc}')
        return
|
|
|
|
|
|
|
|
|
|
|
# Convert every file found in each split folder, fanning the work out over
# 16 joblib workers (one `convert` call per directory entry).
split = ['test']

for s in split:
    tdir = f'./Dataset/{s}/'
    files = os.listdir(tdir)

    runner = Parallel(n_jobs=16)
    runner(delayed(convert)(path, tdir) for path in files)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
# Build ./Dataset/<split>.csv: one row per dataset entry whose audio file is
# present in ./Dataset/<split>/, with 'path' rewritten to the local WAV file.
split = ['test']
for s in split:
    tdir = f'./Dataset/{s}/'
    files = os.listdir(tdir)
    # The dataset refers to the original .mp3 names, so map the local names
    # back before matching.
    files = [file.replace('.wav','.mp3') for file in files]
    # Set built once: O(1) membership per row instead of scanning the list
    # for every dataset row.
    wanted = set(files)
    data = [row for row in dataset[s] if row['path'].split('/')[-1] in wanted]

    df = pd.DataFrame(data)
    # Point 'path' at the converted local file: <tdir><basename>.wav
    df['path'] = df['path'].apply(lambda x: tdir+x.split('/')[-1].replace('.mp3','.wav'))
    df.to_csv(f'./Dataset/{s}.csv',index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Name of the split whose manifest is cleaned in the following steps.
Set = 'test'

# Manifest of that split, read back into a DataFrame.
file = f'./Dataset/{Set}.csv'
df = pd.read_csv(
    file,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Space-separated list of unwanted characters; regex metacharacters are
# backslash-escaped. A raw string is used so that `\*`, `\$` and `\(` are not
# treated as (invalid) string escape sequences; the resulting value is
# unchanged.
pattern = r"& þ 생 \* 삼 ę ń ą đ ṇ ̃ ğ の ö ñ ̧ 집 ý ș ė ã я ľ ō ́ ̀ ø Ø Š ć … ̈ ź ž Ž ś ū \$ ổ ä ̂ ț „ ß 먹 č ā ş ł 기 ř – í ó ú µ š ă ī ż ʔ × ひ \( å 고 ē ÿ ð Я Å"
# Turn the list into a regex alternation: "a|b|c|...".
pattern = pattern.replace(" ","|")
# Bare expression: shows the final pattern when run as a notebook cell.
pattern
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
# Drop every row whose transcript contains at least one unwanted character
# and save the result as Dataset/<Set>-clean.csv.
matcher = re.compile(pattern)  # compiled once, reused for every row
# True for rows to remove. Non-string cells (e.g. NaN for an empty
# transcript) cannot be searched and are left in place instead of raising.
mask = df['sentence'].map(
    lambda text: isinstance(text, str) and matcher.search(text) is not None
).astype(bool)
# Index *labels* of the flagged rows, which is what `drop` expects.
idx = df.index[mask.to_numpy()].tolist()
df = df.drop(labels=idx, axis=0)
saved_file_path = f'Dataset/{Set}-clean.csv'
df.to_csv(saved_file_path,index=False)
print(f"Number of data Removed: {len(idx)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import soundfile as sf |
|
|
|
|
|
# Remove from Dataset/<Set>-clean.csv every row whose audio file contains
# zero frames.
path = f'./Dataset/{Set}'
# sf.info only reads the file header, so the frame count is obtained without
# decoding the whole signal.
files = [os.path.join(path,p) for p in os.listdir(path) if sf.info(os.path.join(path,p)).frames==0]

df = pd.read_csv(f'Dataset/{Set}-clean.csv')
# Normalised set: O(1) lookups, and './Dataset/x' vs 'Dataset/x' style
# differences between the CSV and the directory listing do not matter.
empty_files = {os.path.normpath(f) for f in files}
mask = df['path'].map(
    lambda p: isinstance(p, str) and os.path.normpath(p) in empty_files
).astype(bool)
# Index *labels* of the rows to drop, which is what `drop` expects.
idx = df.index[mask.to_numpy()].tolist()
df = df.drop(labels=idx, axis=0)
df.to_csv(f'Dataset/{Set}-clean.csv',index=False)
print(f"Number of data Removed: {len(idx)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|