# ai_advisor / data_processing.py
# Nishak81478's picture
# Upload 7 files
# 12a8aa9 verified
#import libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
# Download the NLTK resources this module relies on: the stop-word corpus and
# the Punkt tokenizer data behind word_tokenize. NLTK 3.9+ loads Punkt from
# the 'punkt_tab' package rather than the older pickled 'punkt' package, so
# both are fetched to keep tokenization working across NLTK versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# Cache for the English stop-word set. It is filled on first use so the set
# is built once instead of on every clean_text call.
_STOP_WORDS = None

# Punctuation characters stripped from the text. The semicolon is
# deliberately excluded so it survives cleaning.
_PUNCTUATION_TO_REMOVE = frozenset(punctuation) - {';'}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _is_removable_punctuation(token):
    """Return True if *token* consists solely of removable punctuation."""
    # Checking every character (not just whole-token membership) also drops
    # multi-character punctuation tokens such as '...', '--', '``' and "''"
    # that the tokenizer can emit.
    return bool(token) and all(ch in _PUNCTUATION_TO_REMOVE for ch in token)


def clean_text(text):
    """Lowercase *text* and strip punctuation tokens and English stop words.

    Args:
        text: The string to clean. ``None`` and float NaN (a missing cell in
            a pandas column) are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces. Semicolons are kept;
        every other punctuation-only token and every English stop word is
        removed.
    """
    # Missing values have no text to clean; NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase first so stop-word matching is case-insensitive.
    words = word_tokenize(text.lower())
    stop_words = _english_stop_words()
    cleaned_words = [
        word for word in words
        if not _is_removable_punctuation(word) and word not in stop_words
    ]
    return ' '.join(cleaned_words)
# Build the list of cleaned, formatted text records from the career CSV.
def return_clean_df(csv_path='AI-based Career Recommendation System.csv'):
    """Load the career dataset and return one formatted text entry per row.

    Despite its name, this function returns a list of strings, not a
    DataFrame.

    Args:
        csv_path: Path of the CSV file to read. It must contain the columns
            'Skills', 'Interests' and 'Recommended_Career'. Defaults to the
            file name this function originally had hard-coded.

    Returns:
        A list with one string per CSV row, containing the cleaned skills,
        interests and recommended career, separated by blank lines.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Clean each text column into its own Series rather than assigning back
    # into a column subset of the frame, which triggers pandas'
    # SettingWithCopyWarning.
    skills = df['Skills'].apply(clean_text)
    interests = df['Interests'].apply(clean_text)
    careers = df['Recommended_Career'].apply(clean_text)

    # Concatenate the cleaned columns into a single labelled text per row.
    text = (
        'Skills : ' + skills
        + '\n\n' + 'Interests : ' + interests
        + '\n\n' + 'Recommended Career : ' + careers
    )
    return text.to_list()