"""Text cleaning helpers for the AI-based career recommendation dataset.

Reads the career CSV, normalises the ``Skills``, ``Interests`` and
``Recommended_Career`` columns, and renders each row as one text block.
"""

# import libraries
from functools import lru_cache
from string import punctuation
from typing import List

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the Punkt tokenizer from 'punkt_tab' instead of 'punkt';
# without it word_tokenize() raises LookupError on those releases.
nltk.download('punkt_tab')

# Default input file and the text columns that are cleaned and combined.
DEFAULT_CSV_PATH = 'AI-based Career Recommendation System.csv'
TEXT_COLUMNS = ['Skills', 'Interests', 'Recommended_Career']


@lru_cache(maxsize=None)
def _tokens_to_drop() -> frozenset:
    """Return the tokens removed by ``clean_text``, built once on first use.

    The set is the English stopword list plus every single punctuation
    character except ``;``. It is built lazily (not at import time) so a
    missing stopword corpus still surfaces when cleaning is first attempted.
    """
    # ';' is deliberately kept -- presumably it separates list items in the
    # dataset's cells; confirm against the CSV before changing.
    punctuation_to_remove = set(punctuation) - {';'}
    return frozenset(punctuation_to_remove | set(stopwords.words('english')))


# clean text
def clean_text(text: object) -> str:
    """Lowercase *text*, tokenize it, and drop punctuation and stopwords.

    Args:
        text: The raw cell value. Missing values (``None``/NaN) are treated
            as an empty string; other non-string values are stringified.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # pandas represents empty CSV cells as float NaN, which has no
        # .lower(); treat missing values as empty text instead of crashing.
        text = '' if pd.isna(text) else str(text)

    # Tokenize the lowercased text using NLTK's word_tokenize.
    words = word_tokenize(text.lower())

    # NOTE: only tokens that are exactly one punctuation character match the
    # drop set; multi-character punctuation tokens are left in place.
    drop = _tokens_to_drop()
    return ' '.join(word for word in words if word not in drop)


# return final dataframe
def return_clean_df(csv_path: str = DEFAULT_CSV_PATH) -> List[str]:
    """Load the career CSV and return one cleaned text block per row.

    Despite the name, the result is a list of strings, not a DataFrame.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the dataset file
            expected in the current working directory.

    Returns:
        A list with one string per CSV row, combining the cleaned skills,
        interests and recommended career.

    Raises:
        KeyError: If any of the required columns is missing from the CSV.
    """
    # Take an explicit copy so the column assignments below act on an
    # independent frame and do not trigger SettingWithCopyWarning.
    df = pd.read_csv(csv_path)[TEXT_COLUMNS].copy()

    # clean columns having text data
    for column in TEXT_COLUMNS:
        df[column] = df[column].apply(clean_text)

    # concatenate all the text data
    combined = (
        'Skills : ' + df['Skills'] + '\n\n'
        + 'Interests : ' + df['Interests'] + '\n\n'
        + 'Recommended Career : ' + df['Recommended_Career']
    )
    return combined.to_list()