# `packages` and `utils` are project-local modules; their star imports are
# assumed to provide pandas as pd, re, datasets.load_dataset, a Persian text
# Normalizer, and the EDA helpers eda_dataset / show_short_samples.
from packages import *
from utils import *


def preprocess_fn(dataset: pd.DataFrame, normalization=False, show_sample=False) -> pd.DataFrame:
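    """Clean a raw text DataFrame: de-duplicate rows, strip URLs, HTML tags,
    and non-Persian characters, drop empty rows, and collapse whitespace.
    Optionally normalize the text and print three random samples."""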
    # De-duplicate on the raw text, then work on a copy so the caller's
    # DataFrame is never mutated.
    dataset = dataset.drop_duplicates(subset='text')
    dataset = dataset.copy()

    # Remove URLs.
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'http\S+|www\S+', '', x) if isinstance(x, str) else x)

    # Remove HTML tags.
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'<[^>]+>', '', x) if isinstance(x, str) else x)

    # Keep only Arabic-script (Persian) characters, whitespace, and basic punctuation.
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[^\u0600-\u06FF\s.,!?]', '', x) if isinstance(x, str) else x)

    # Remove Latin letters (redundant after the previous filter; kept as a safety net).
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[a-zA-Z]+', '', x) if isinstance(x, str) else x)

    # Drop rows that are missing or whitespace-only.
    dataset = dataset[dataset['text'].notna() & (dataset['text'].str.strip() != '')]

    # Collapse runs of whitespace into single spaces.
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip() if isinstance(x, str) else x)

    if normalization:
        # Build the normalizer once rather than once per row.
        normalizer = Normalizer(persian_numbers=True)
        dataset['text'] = dataset['text'].apply(lambda x: normalizer.normalize(x) if isinstance(x, str) else x)

    if show_sample:
        print(dataset['text'].sample(n=3).to_list())

    return dataset


def uniform_length(dataset: pd.DataFrame, target_length=1000) -> pd.DataFrame:
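    """Re-chunk all text into pieces of roughly `target_length` characters,
    extending each chunk to the next whitespace so words are never split."""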
    # Concatenate every row into one long string, then slice it back up.
    full_text = ' '.join(dataset['text'].dropna().astype(str))

    chunks, start = [], 0
    while start < len(full_text):
        end = min(start + target_length, len(full_text))
        # Extend the cut point to the next whitespace so words stay intact.
        while end < len(full_text) and full_text[end] not in (' ', '\n'):
            end += 1
        chunk = full_text[start:end].strip()
        if chunk:  # skip chunks that are empty after stripping
            chunks.append(chunk)
        start = end + 1

    return pd.DataFrame(chunks, columns=['text'])


def preprocess_pipeline_fn(eda=False, save_dataset=True, shuffle=False):
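    """Download the three source corpora, clean and re-chunk each one,
    merge them, and optionally run EDA, shuffle, and save the result."""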
    # Persian Wikipedia dump (2023-11-01 snapshot).
    wiki_dataset = load_dataset("wikimedia/wikipedia", "20231101.fa")
    wiki_dataset = pd.DataFrame(wiki_dataset['train']['text'], columns=['text'])

    # Locally stored Persian blog corpus.
    blog_dataset = pd.read_csv("./blogs/blogs.csv")

    # HomoRich G2P corpus; only the grapheme (plain text) column is used.
    homorich_dataset = load_dataset("MahtaFetrat/HomoRich-G2P-Persian", verification_mode="no_checks")
    homorich_dataset = pd.DataFrame(homorich_dataset["train"]["Grapheme"], columns=['text'])

    print(70 * "-")
    print("Downloading and loading the datasets is done.")
    print(70 * "-")
    dataset_names = ["Wikipedia", "Persian Blog", "HomoRich"]
    datasets = [wiki_dataset, blog_dataset, homorich_dataset]
    del wiki_dataset, blog_dataset, homorich_dataset  # free memory early

    # Clean and re-chunk each corpus in place, with optional EDA reports.
    for i in range(len(datasets)):
        datasets[i] = uniform_length(preprocess_fn(datasets[i]))
        if eda:
            eda_dataset(datasets[i], dataset_names[i])
            show_short_samples(datasets[i], dataset_names[i])
    merged_dataset = pd.concat(datasets, ignore_index=True)
    del datasets

    if shuffle:
        merged_dataset = merged_dataset.sample(frac=1).reset_index(drop=True)

    print(70 * "-")
    print("Preprocessing is done.")
    print(70 * "-")

    # utf-8-sig keeps the Persian text readable when the CSV is opened in Excel.
    if save_dataset:
        merged_dataset.to_csv('merged_dataset.csv', index=False, encoding='utf-8-sig')

    return merged_dataset
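

# A minimal usage sketch (assuming the star imports above resolve). Running the
# full pipeline downloads several GB of data, so this is illustrative rather
# than a test:
if __name__ == "__main__":
    merged = preprocess_pipeline_fn(eda=False, save_dataset=True, shuffle=True)
    print(merged.shape)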