Spaces:

Nishak81478
/

ai_advisor

Sleeping

ai_advisor / data_processing.py

Upload 7 files

12a8aa9 verified about 1 year ago

1.41 kB

	#import libraries
	from functools import lru_cache
	from string import punctuation

	import nltk
	import pandas as pd
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize

	# Download the NLTK resources used by clean_text: the stopword list and
	# the Punkt tokenizer data that word_tokenize needs.
	nltk.download('stopwords')
	nltk.download('punkt')
	# NLTK 3.9+ loads Punkt from the 'punkt_tab' package instead of 'punkt';
	# request both so tokenization works across NLTK versions.
	# NOTE(review): older NLTK releases are expected to just report the unknown
	# package and continue -- confirm against the pinned NLTK version.
	nltk.download('punkt_tab')


	# clean text
	@lru_cache(maxsize=1)
	def _tokens_to_drop():
	    """Return the tokens that ``clean_text`` filters out.

	    The set holds the English stopwords plus every punctuation character
	    except the semicolon. It is built on first use and cached, so the
	    stopword list is loaded once rather than once per cleaned value.
	    """
	    kept_punctuation = {';'}
	    return (
	        frozenset(stopwords.words('english'))
	        | (frozenset(punctuation) - kept_punctuation)
	    )


	def clean_text(text):
	    """Lowercase *text* and strip stopwords and punctuation tokens.

	    Args:
	        text: The value to clean. ``None`` and float NaN (how pandas
	            represents an empty CSV cell) are treated as empty text; any
	            other non-string value is converted with ``str``.

	    Returns:
	        The remaining tokens joined by single spaces. Semicolon tokens are
	        kept; all other single-character punctuation tokens and English
	        stopwords are removed.
	    """
	    # NaN is the only float that is not equal to itself.
	    if text is None or (isinstance(text, float) and text != text):
	        return ''

	    # Tokenize the lowercased text using NLTK's word_tokenize
	    words = word_tokenize(str(text).lower())

	    dropped = _tokens_to_drop()
	    return ' '.join(word for word in words if word not in dropped)


	# build the cleaned text records (a list of strings, not a dataframe)
	def return_clean_df(csv_path='AI-based Career Recommendation System.csv'):
	    """Load the career dataset and return one cleaned text record per row.

	    Args:
	        csv_path: Path of the CSV file to read. Defaults to the dataset
	            file name this module has always used.

	    Returns:
	        A list of strings, one per CSV row, each formatted as
	        ``Skills : ...``, ``Interests : ...`` and
	        ``Recommended Career : ...`` separated by blank lines, with every
	        value passed through ``clean_text``.

	    Raises:
	        KeyError: If one of the ``Skills``, ``Interests`` or
	            ``Recommended_Career`` columns is missing from the file.
	    """
	    text_columns = ('Skills', 'Interests', 'Recommended_Career')
	    df = pd.read_csv(csv_path)

	    # Clean each column into its own Series instead of assigning back into
	    # a column subset, so no frame is mutated along the way.
	    cleaned = {column: df[column].apply(clean_text) for column in text_columns}

	    # concatenate all the text data
	    text = (
	        'Skills : ' + cleaned['Skills'] + '\n\n'
	        + 'Interests : ' + cleaned['Interests'] + '\n\n'
	        + 'Recommended Career : ' + cleaned['Recommended_Career']
	    )
	    return text.to_list()