# Import libraries
from functools import lru_cache
from string import punctuation

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: the English stopword list and the
# Punkt tokenizer data behind word_tokenize. Recent NLTK releases (3.9+) load
# the tokenizer from 'punkt_tab' instead of the pickled 'punkt' package, so
# both are fetched to keep the script working across NLTK versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)


# clean text
@lru_cache(maxsize=1)
def _token_filters():
    """Return the (punctuation, stop word) sets used by ``clean_text``.

    Cached so the NLTK stopword corpus is read once instead of on every call.
    """
    # The semicolon is deliberately kept out of the removal set.
    punctuation_to_remove = frozenset(punctuation) - {';'}
    stop_words = frozenset(stopwords.words('english'))
    return punctuation_to_remove, stop_words


def clean_text(text):
    """Lowercase *text* and drop punctuation tokens and English stopwords.

    Args:
        text: The value to clean. ``None`` and NaN (what pandas yields for an
            empty cell) are treated as empty text; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The surviving tokens joined by single spaces. Semicolons are kept.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if not text:
        return ''

    punctuation_to_remove, stop_words = _token_filters()

    # Tokenize the lowercased text using NLTK's word_tokenize
    words = word_tokenize(text.lower())

    # A token counts as punctuation when *every* character is in the removal
    # set, so multi-character tokens such as "``", "''" or "..." are dropped
    # too, while tokens that merely contain punctuation (e.g. "c++") are kept.
    cleaned_words = [
        word for word in words
        if word not in stop_words
        and not all(char in punctuation_to_remove for char in word)
    ]

    # Join words back into a single string
    return ' '.join(cleaned_words)


# build the final list of cleaned text records (one string per CSV row)
def return_clean_df(csv_path='AI-based Career Recommendation System.csv'):
    """Load the career dataset and return one cleaned text record per row.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the dataset file
            name this function previously hard-coded. The file must provide
            the 'Skills', 'Interests' and 'Recommended_Career' columns.

    Returns:
        A list of strings, one per CSV row, each combining the cleaned
        skills, interests and recommended career under labelled headings
        separated by blank lines.

    Raises:
        KeyError: If one of the required columns is missing from the file.
    """
    # dtype=str keeps every cell as the text found in the file, so the
    # cleaning step never receives numbers.
    raw = pd.read_csv(csv_path, dtype=str)

    # Build the cleaned columns as separate Series instead of assigning back
    # into a column-subset frame, which can trigger SettingWithCopyWarning.
    # Empty cells are read as NaN; normalise them to '' before cleaning.
    cleaned = {
        column: raw[column].fillna('').map(clean_text)
        for column in ('Skills', 'Interests', 'Recommended_Career')
    }

    # concatenate all the text data
    records = (
        'Skills :  ' + cleaned['Skills'] + '\n\n'
        + 'Interests :  ' + cleaned['Interests'] + '\n\n'
        + 'Recommended Career :  ' + cleaned['Recommended_Career']
    )
    return records.to_list()