from data_analysis import df
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import nltk
# Duplicate removal (currently disabled; see the pipeline at the bottom
# of this file).
# df = df.drop_duplicates(subset='Text')
# df = df.reset_index(drop=True)
# Fetch the Punkt tokenizer model that word_tokenize() needs at runtime.
nltk.download('punkt')
# Punctuation/symbol tokens to drop after tokenization.  Stored as a set
# so the per-token membership test in clean_text() is O(1) instead of the
# O(n) scan the original list required.
nonalphanumeric = {
    '\'', '.', ',', '"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
    '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
    '/', '>', '<', '|', ' ',
}
def clean_text(text):
    """
    Clean and preprocess a piece of raw text.

    The text is tokenized with NLTK's ``word_tokenize`` (not spaCy, and no
    lemmatization is performed — the original comments were wrong on both
    counts), tokens consisting entirely of punctuation/symbols are dropped,
    the survivors are lower-cased, and the result is re-joined into a
    single space-separated string.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Lower-cased text with punctuation tokens removed.
    """
    tokens = word_tokenize(text)
    # Keep only tokens containing at least one alphanumeric character.
    # This subsumes the old single-character `nonalphanumeric` blacklist
    # and also removes multi-character punctuation tokens that
    # word_tokenize produces (e.g. "``", "''", "..."), which the blacklist
    # missed.  str.isalnum() is True for CJK characters, so Chinese text
    # is preserved.
    words = [tok.lower() for tok in tokens if any(ch.isalnum() for ch in tok)]
    # Join the surviving tokens back into a single string.
    return " ".join(words)
def remove_english(text):
    """
    Strip every run of ASCII letters (i.e. English words) from *text*.

    Parameters
    ----------
    text : str
        Input text, possibly mixing English with another script.

    Returns
    -------
    str
        The input with all ``[a-zA-Z]`` runs deleted; digits, whitespace,
        punctuation and non-Latin characters are left untouched.
    """
    return re.sub(r"[a-zA-Z]+", "", text)
# Applying clean_text to every row of the 'Text' column (disabled).
# df['clean_text'] = df['Text'].apply(clean_text)
# # Removing English words from the Chinese subset
# df_Chinese = df[df['language'] == 'Chinese']  # Chinese rows of the dataset
# # NOTE(review): the next line rebinds the name `clean_text`, shadowing the
# # function defined above — rename the variable before re-enabling.
# clean_text = df.loc[df.language == 'Chinese']['clean_text']
# clean_text = clean_text.apply(remove_english)  # removing English words
# df_Chinese.loc[:, 'clean_text'] = clean_text
# # Concatenate the original DataFrame with the cleaned Chinese text DataFrame
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
# # Drop rows with 'Chinese' language from the original DataFrame
# # NOTE(review): df_Chinese rows still have language == 'Chinese', so this
# # filter would also remove the freshly cleaned rows appended above — the
# # drop should happen before the concat. Verify before re-enabling.
# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
# # Shuffle the dataframe and reset the index
# df = df.sample(frac=1).reset_index(drop=True)