Spaces:
Sleeping
Sleeping
| #import libraries | |
| import pandas as pd | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from string import punctuation | |
# Download the NLTK data this module relies on: the English stopword list
# and the Punkt tokenizer models behind word_tokenize.
# Recent NLTK releases load Punkt from the 'punkt_tab' package while older
# ones use 'punkt', so both are fetched to work with either.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# clean text
_DROPPED_TOKENS_CACHE = {}


def _dropped_tokens():
    """Return the set of tokens clean_text discards, building it only once.

    The set holds every ``string.punctuation`` character except the
    semicolon, plus NLTK's English stopwords. It is built lazily on first
    use rather than at import time, so a missing stopwords corpus still
    surfaces on the first real call.
    """
    if 'tokens' not in _DROPPED_TOKENS_CACHE:
        punctuation_to_remove = set(punctuation) - {';'}
        stop_words = set(stopwords.words('english'))
        _DROPPED_TOKENS_CACHE['tokens'] = punctuation_to_remove | stop_words
    return _DROPPED_TOKENS_CACHE['tokens']


def clean_text(text):
    """Lowercase *text*, tokenize it, and drop punctuation and stopwords.

    Args:
        text: The value to clean. ``None`` and float NaN are treated as
            missing; any other non-string value is converted with ``str()``.

    Returns:
        The surviving tokens joined by single spaces. Semicolon tokens are
        kept. Missing, empty, or whitespace-only input yields ``''``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Nothing to tokenize; skip the tokenizer entirely.
    if not text.strip():
        return ''

    # Tokenize the lowercased text using NLTK's word_tokenize
    words = word_tokenize(text.lower())

    # Remove punctuation (except semicolon) and stopwords
    dropped = _dropped_tokens()
    return ' '.join(word for word in words if word not in dropped)
# return the cleaned rows as a list of text blobs (not a dataframe)
def return_clean_df(csv_path='AI-based Career Recommendation System.csv'):
    """Load the career CSV and return one cleaned text blob per row.

    Args:
        csv_path: Path of the CSV file to read. It must contain the
            'Skills', 'Interests' and 'Recommended_Career' columns.

    Returns:
        A list of strings, one per CSV row, each combining the cleaned
        skills, interests and recommended career under labelled headings.

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    text_columns = ['Skills', 'Interests', 'Recommended_Career']

    # .copy() gives an independent frame, so the column assignments below
    # cannot trigger pandas' SettingWithCopyWarning.
    df = pd.read_csv(csv_path)[text_columns].copy()

    # clean columns having text data
    for column in text_columns:
        # Empty cells are read as NaN; blank them and force str so the
        # cleaner always receives a string.
        df[column] = df[column].fillna('').astype(str).apply(clean_text)

    # concatenate all the text data
    combined = (
        'Skills : ' + df['Skills'] + '\n\n'
        + 'Interests : ' + df['Interests'] + '\n\n'
        + 'Recommended Career : ' + df['Recommended_Career']
    )
    return combined.to_list()