from data_analysis import df
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import nltk
# Duplicate removal (currently disabled; see the pipeline at the bottom
# of this file).
# df = df.drop_duplicates(subset='Text')
# df = df.reset_index(drop=True)
# Fetch the Punkt tokenizer model that word_tokenize() needs at runtime.
nltk.download('punkt')
# Punctuation/symbol tokens to drop after tokenization.  Stored as a set
# so the per-token membership test in clean_text() is O(1) instead of the
# O(n) scan the original list required.
nonalphanumeric = {
    '\'', '.', ',', '"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
    '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
    '/', '>', '<', '|', ' ',
}
def clean_text(text):
    """
    Clean and preprocess a piece of raw text.

    The text is tokenized with NLTK's ``word_tokenize`` (not spaCy, and no
    lemmatization is performed — the original comments were wrong on both
    counts), tokens consisting entirely of punctuation/symbols are dropped,
    the survivors are lower-cased, and the result is re-joined into a
    single space-separated string.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Lower-cased text with punctuation tokens removed.
    """
    tokens = word_tokenize(text)
    # Keep only tokens containing at least one alphanumeric character.
    # This subsumes the old single-character `nonalphanumeric` blacklist
    # and also removes multi-character punctuation tokens that
    # word_tokenize produces (e.g. "``", "''", "..."), which the blacklist
    # missed.  str.isalnum() is True for CJK characters, so Chinese text
    # is preserved.
    words = [tok.lower() for tok in tokens if any(ch.isalnum() for ch in tok)]
    # Join the surviving tokens back into a single string.
    return " ".join(words)
def remove_english(text):
    """
    Strip every run of ASCII letters (i.e. English words) from *text*.

    Parameters
    ----------
    text : str
        Input text, possibly mixing English with another script.

    Returns
    -------
    str
        The input with all ``[a-zA-Z]`` runs deleted; digits, whitespace,
        punctuation and non-Latin characters are left untouched.
    """
    return re.sub(r"[a-zA-Z]+", "", text)
# Applying clean_text to every row of the 'Text' column (disabled).
# df['clean_text'] = df['Text'].apply(clean_text)
# # Removing English words from the Chinese subset
# df_Chinese = df[df['language'] == 'Chinese']  # Chinese rows of the dataset
# # NOTE(review): the next line rebinds the name `clean_text`, shadowing the
# # function defined above — rename the variable before re-enabling.
# clean_text = df.loc[df.language == 'Chinese']['clean_text']
# clean_text = clean_text.apply(remove_english)  # removing English words
# df_Chinese.loc[:, 'clean_text'] = clean_text
# # Concatenate the original DataFrame with the cleaned Chinese text DataFrame
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
# # Drop rows with 'Chinese' language from the original DataFrame
# # NOTE(review): df_Chinese rows still have language == 'Chinese', so this
# # filter would also remove the freshly cleaned rows appended above — the
# # drop should happen before the concat. Verify before re-enabling.
# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
# # Shuffle the dataframe and reset the index
# df = df.sample(frac=1).reset_index(drop=True)