Spaces:
Runtime error
Runtime error
| import altair as alt | |
| # import datetime | |
| import joblib | |
| import nltk | |
| import numpy as np | |
| import pandas as pd | |
| import re | |
| import streamlit as st | |
| import time | |
| from gensim.corpora import Dictionary | |
| from gensim.models import KeyedVectors, TfidfModel | |
| from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex | |
| from gensim.similarities.annoy import AnnoyIndexer | |
| from io import BytesIO | |
| from nltk import pos_tag, word_tokenize | |
| from nltk.corpus import stopwords, wordnet | |
| from nltk.stem import PorterStemmer, WordNetLemmatizer | |
| from pandas.api.types import is_categorical_dtype, is_numeric_dtype | |
| from PIL import Image | |
| from scipy.sparse import csr_matrix, hstack | |
# Fetch the NLTK resources this module relies on (POS tagger, tokenizer,
# stop-word list and WordNet).
for _nltkResource in ('averaged_perceptron_tagger', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltkResource)

# Shared text-processing objects used by the cleaning helpers below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def addZeroFeatures(matrix, maxFeatures = 18038):
    """Right-pad a sparse feature matrix with all-zero columns.

    Args:
        matrix: 2-D scipy sparse matrix (rows are documents, columns are terms).
        maxFeatures: Target number of columns. Defaults to 18038, the width
            that used to be hard-coded here (presumably the width the
            downstream models were fitted on -- verify against training code).

    Returns:
        ``matrix`` unchanged when it already has at least ``maxFeatures``
        columns; otherwise a CSR matrix with zero columns appended on the
        right so that it has exactly ``maxFeatures`` columns.
    """
    numDocs, numTerms = matrix.shape
    missingFeatures = maxFeatures - numTerms
    if missingFeatures <= 0:
        return matrix
    zeroFeatures = csr_matrix((numDocs, missingFeatures), dtype = np.float64)
    # hstack returns COO by default; ask for CSR so the padded result stays
    # row-sliceable like the unpadded input.
    return hstack([matrix, zeroFeatures], format = 'csr')
def classifyResumes(df):
    """Predict an industry category for every row of ``df`` and report progress.

    The 'Resume' column is cleaned with ``performStemming``, vectorised with
    the stored TF-IDF vectorizer, padded, reduced with
    ``dimensionalityReduction`` and classified with the stored KNN model.
    The decoded labels are written to a categorical 'Industry Category'
    column on ``df`` itself.

    Args:
        df: DataFrame with a 'Resume' column of resume texts.

    Returns:
        The same ``df`` object, with the 'Industry Category' column added.
    """
    progressBar = st.progress(0)
    progressBar.progress(0, text = "Preprocessing data ...")
    startTime = time.time()
    # Keep the cleaned text in a local array instead of a temporary
    # 'cleanedResume' column: the temporary column overwrote (and then
    # dropped) any user column with that name and was left behind on errors.
    resumeText = df['Resume'].apply(performStemming).values

    progressBar.progress(20, text = "Extracting features ...")
    vectorizer = loadTfidfVectorizer()
    wordFeatures = vectorizer.transform(resumeText)
    wordFeaturesWithZeros = addZeroFeatures(wordFeatures)

    progressBar.progress(40, text = "Reducing dimensionality ...")
    finalFeatures = dimensionalityReduction(wordFeaturesWithZeros)

    progressBar.progress(60, text = "Predicting categories ...")
    knn = loadKnnModel()
    predictedCategories = knn.predict(finalFeatures)

    progressBar.progress(80, text = "Finishing touches ...")
    le = loadLabelEncoder()
    df['Industry Category'] = pd.Categorical(le.inverse_transform(predictedCategories))

    # Format the elapsed wall-clock time as "H h : M m : S.ss s".
    elapsedSeconds = time.time() - startTime
    hours, remainder = divmod(int(elapsedSeconds), 3600)
    minutes, _ = divmod(remainder, 60)
    secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
    elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'

    progressBar.progress(100, text = 'Classification Complete!')
    # Leave the completed bar visible briefly before removing it.
    time.sleep(1)
    progressBar.empty()
    st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
    return df
def clickClassify():
    """Set the ``processClf`` flag in Streamlit's session state to True."""
    st.session_state['processClf'] = True
def clickRank():
    """Set the ``processRank`` flag in Streamlit's session state to True."""
    st.session_state['processRank'] = True
def convertDfToXlsx(df):
    """Serialise ``df`` to an in-memory .xlsx workbook.

    The frame is written without its index to a sheet named 'Sheet1', and
    column A is given a two-decimal number format.

    Args:
        df: DataFrame to export.

    Returns:
        The workbook contents as ``bytes``.
    """
    output = BytesIO()
    # The context manager finalises the workbook even when writing raises,
    # instead of relying on an explicit close() that an exception would skip.
    with pd.ExcelWriter(output, engine = 'xlsxwriter') as writer:
        df.to_excel(writer, index = False, sheet_name = 'Sheet1')
        workbook = writer.book
        worksheet = writer.sheets['Sheet1']
        format1 = workbook.add_format({'num_format': '0.00'})
        # Width None keeps the default column width; only the format is applied.
        worksheet.set_column('A:A', None, format1)
    return output.getvalue()
def createBarChart(df):
    """Build an Altair bar chart of how many rows fall in each 'Industry Category'.

    Args:
        df: DataFrame containing an 'Industry Category' column.

    Returns:
        The configured Altair chart (counts on x, categories on y).
    """
    categoryCounts = df['Industry Category'].value_counts().reset_index()
    categoryCounts.columns = ['Industry Category', 'Count']
    chartData = pd.DataFrame(categoryCounts)

    xAxis = alt.X('Count:Q', axis = alt.Axis(format = 'd'), title = 'Number of Resumes')
    yAxis = alt.Y('Industry Category:N', title = 'Category')

    bars = alt.Chart(chartData).mark_bar(color = '#56B6C2', size = 13)
    encodedBars = bars.encode(
        x = xAxis,
        y = yAxis,
        tooltip = ['Industry Category', 'Count'],
    )
    return encodedBars.properties(title = 'Number of Resumes per Category')
def dimensionalityReduction(features):
    """Transform ``features`` with the model stored in 'nca_model.joblib'.

    Args:
        features: Sparse matrix; it is densified with ``toarray()`` before
            being handed to the loaded model's ``transform``.

    Returns:
        Whatever the loaded model's ``transform`` returns.
    """
    ncaModel = joblib.load('nca_model.joblib')
    denseFeatures = features.toarray()
    return ncaModel.transform(denseFeatures)
def filterDataframeClf(df: pd.DataFrame) -> pd.DataFrame:
    """Render optional filter widgets for the classification table and apply them.

    When the "Add filters" toggle is off, ``df`` is returned untouched.
    Otherwise a copy is filtered column by column: categorical columns get a
    multiselect, numeric columns a range slider, and everything else a
    case-insensitive substring/regex text box.

    Args:
        df: Table to filter.

    Returns:
        ``df`` itself (toggle off) or a filtered copy.
    """
    if not st.toggle("Add filters", key = 'filter-clf-1'):
        return df

    df = df.copy()
    with st.container():
        toFilterColumns = st.multiselect("Filter table on", df.columns, key = 'filter-clf-2')
        for position, column in enumerate(toFilterColumns):
            left, right = st.columns((1, 20))
            left.write("↳")
            widgetKey = f'filter-clf-{position}-{column}'
            # isinstance check replaces the deprecated is_categorical_dtype().
            if isinstance(df[column].dtype, pd.CategoricalDtype):
                userCatInput = right.multiselect(
                    f'Values for {column}',
                    df[column].unique(),
                    default = list(df[column].unique()),
                    key = widgetKey
                )
                df = df[df[column].isin(userCatInput)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # A constant column gives a zero step and an empty/all-NaN
                # column gives NaN bounds; neither is a usable slider range.
                if not _min < _max:
                    right.caption(f'No range of values to filter on for {column}')
                    continue
                userNumInput = right.slider(
                    f'Values for {column}',
                    min_value = _min,
                    max_value = _max,
                    value = (_min, _max),
                    step = (_max - _min) / 100,
                    key = widgetKey
                )
                df = df[df[column].between(*userNumInput)]
            else:
                userTextInput = right.text_input(
                    f'Substring or regex in {column}',
                    key = widgetKey
                )
                if userTextInput:
                    try:
                        # case=False keeps the match case-insensitive without
                        # lower-casing the pattern, which would turn regex
                        # classes such as \S or \D into \s or \d.
                        matches = df[column].astype(str).str.contains(userTextInput, case = False)
                    except re.error:
                        right.error('Invalid regular expression')
                        continue
                    df = df[matches]
    return df
def filterDataframeRnk(df: pd.DataFrame) -> pd.DataFrame:
    """Render optional filter widgets for the ranking table and apply them.

    When the "Add filters" toggle is off, ``df`` is returned untouched.
    Otherwise a copy is filtered column by column: categorical columns get a
    multiselect, numeric columns a range slider, and everything else a
    case-insensitive substring/regex text box.

    Args:
        df: Table to filter.

    Returns:
        ``df`` itself (toggle off) or a filtered copy.
    """
    if not st.toggle("Add filters", key = 'filter-rnk-1'):
        return df

    df = df.copy()
    with st.container():
        toFilterColumns = st.multiselect("Filter table on", df.columns, key = 'filter-rnk-2')
        for position, column in enumerate(toFilterColumns):
            left, right = st.columns((1, 20))
            left.write("↳")
            widgetKey = f'filter-rnk-{position}-{column}'
            # isinstance check replaces the deprecated is_categorical_dtype().
            if isinstance(df[column].dtype, pd.CategoricalDtype):
                userCatInput = right.multiselect(
                    f'Values for {column}',
                    df[column].unique(),
                    default = list(df[column].unique()),
                    key = widgetKey
                )
                df = df[df[column].isin(userCatInput)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # A constant column gives a zero step and an empty/all-NaN
                # column gives NaN bounds; neither is a usable slider range.
                if not _min < _max:
                    right.caption(f'No range of values to filter on for {column}')
                    continue
                userNumInput = right.slider(
                    f'Values for {column}',
                    min_value = _min,
                    max_value = _max,
                    value = (_min, _max),
                    step = (_max - _min) / 100,
                    key = widgetKey
                )
                df = df[df[column].between(*userNumInput)]
            else:
                userTextInput = right.text_input(
                    f'Substring or regex in {column}',
                    key = widgetKey
                )
                if userTextInput:
                    try:
                        # case=False keeps the match case-insensitive without
                        # lower-casing the pattern, which would turn regex
                        # classes such as \S or \D into \s or \d.
                        matches = df[column].astype(str).str.contains(userTextInput, case = False)
                    except re.error:
                        right.error('Invalid regular expression')
                        continue
                    df = df[matches]
    return df
def getWordnetPos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ADJ, VERB, NOUN and ADV
    respectively; any other tag falls back to NOUN.
    """
    prefixToPos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnetPos in prefixToPos:
        if tag.startswith(prefix):
            return wordnetPos
    return wordnet.NOUN
def loadKnnModel():
    """Load and return the object stored in 'knn_model.joblib'."""
    return joblib.load('knn_model.joblib')
def loadLabelEncoder():
    """Load and return the object stored in 'label_encoder.joblib'."""
    return joblib.load('label_encoder.joblib')
def loadTfidfVectorizer():
    """Load and return the object stored in 'tfidf_vectorizer.joblib'."""
    return joblib.load('tfidf_vectorizer.joblib')
def performLemmatization(text):
    """Clean ``text`` and return its lemmatised, stop-word-free tokens.

    URLs, the standalone tokens "RT"/"cc", hashtags, mentions, punctuation
    and non-ASCII characters are stripped, whitespace is collapsed, and the
    remaining tokens are POS-tagged and lemmatised in lower case.

    Args:
        text: Raw document text.

    Returns:
        List of lemmatised lower-case tokens with English stop words removed.
    """
    # Raw strings avoid invalid-escape warnings for \S and \s.
    text = re.sub(r'http\S+\s*', ' ', text)
    # Word boundaries keep "RT"/"cc" from being cut out of the middle of
    # ordinary words (e.g. "account" used to become "a ount").
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    text = re.sub(r'\s+', ' ', text)
    words = word_tokenize(text)
    # Tag before lower-casing so the tagger sees the original capitalisation.
    return [
        lemmatizer.lemmatize(word.lower(), pos = getWordnetPos(pos))
        for word, pos in pos_tag(words)
        if word.lower() not in stop_words
    ]
def performStemming(text):
    """Clean ``text`` and return it as a space-joined string of stemmed tokens.

    URLs, "RT"/"cc", hashtags, mentions, punctuation and non-ASCII characters
    are stripped, whitespace is collapsed, and the remaining tokens are
    lower-cased, filtered against English stop words and Porter-stemmed.

    Args:
        text: Raw document text.

    Returns:
        The cleaned, stemmed tokens joined by single spaces.
    """
    # Raw strings avoid invalid-escape warnings for \S and \s; the compiled
    # patterns are unchanged.
    text = re.sub(r'http\S+\s*', ' ', text)
    # NOTE(review): 'RT|cc' also matches inside words (e.g. "account" becomes
    # "a ount"). Left as-is because the stored TF-IDF vectorizer and
    # classifier were presumably fitted on text cleaned exactly this way --
    # confirm and retrain before tightening the pattern.
    text = re.sub(r'RT|cc', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    text = re.sub(r'\s+', ' ', text)
    words = word_tokenize(text)
    stemmedWords = [
        stemmer.stem(word.lower())
        for word in words
        if word.lower() not in stop_words
    ]
    return ' '.join(stemmedWords)
# Streamlit re-executes this script on every user interaction; caching the
# loaded vectors as a shared resource avoids re-parsing the .vec file each time.
@st.cache_resource
def loadModel():
    """Load the word vectors stored in 'wiki-news-300d-1M-subword.vec'.

    Returns:
        The ``KeyedVectors`` instance read from the word2vec-format file.
    """
    model_path = 'wiki-news-300d-1M-subword.vec'
    return KeyedVectors.load_word2vec_format(model_path)


# Module-level handle used by rankResumes.
model = loadModel()
def rankResumes(text, df):
    """Rank the resumes in ``df`` by soft-cosine similarity to a job description.

    The job description and every 'Resume' entry are lemmatised, a gensim
    dictionary and TF-IDF model are built from them, and a term-similarity
    matrix is derived from the module-level word vectors (``model``). The
    resulting scores are written to ``df`` as 'Similarity Score (-1 to 1)'
    together with a dense 'Rank', and ``df`` is sorted by rank in place.

    Args:
        text: Job description text used as the query.
        df: DataFrame with a 'Resume' column of resume texts.

    Returns:
        The same ``df`` object, with the score and rank columns added and
        rows sorted by 'Rank'.
    """
    progressBar = st.progress(0)
    progressBar.progress(0, text = "Preprocessing data ...")
    startTime = time.time()
    jobDescriptionText = performLemmatization(text)
    # Keep the token lists local instead of in a temporary 'cleanedResume'
    # column: the temporary column overwrote (and then dropped) any user
    # column with that name and was left behind on errors.
    cleanedResumes = df['Resume'].apply(performLemmatization).tolist()
    documents = [jobDescriptionText] + cleanedResumes

    progressBar.progress(13, text = "Creating a dictionary ...")
    dictionary = Dictionary(documents)

    progressBar.progress(25, text = "Creating a TF-IDF model ...")
    tfidf = TfidfModel(dictionary = dictionary)

    progressBar.progress(38, text = "Creating a Similarity Index...")
    words = [word for word, _ in dictionary.most_common()]
    # Restrict the embedding table to the vocabulary actually present.
    wordVectors = model.vectors_for_all(words, allow_inference = False)
    indexer = AnnoyIndexer(wordVectors, num_trees = 300)
    similarityIndex = WordEmbeddingSimilarityIndex(wordVectors, kwargs = {'indexer': indexer})

    progressBar.progress(50, text = "Creating a Similarity Matrix...")
    similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)

    progressBar.progress(63, text = "Setting up job description as the query ...")
    query = tfidf[dictionary.doc2bow(jobDescriptionText)]

    progressBar.progress(75, text = "Calculating semantic similarities ...")
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(resume) for resume in cleanedResumes]],
        similarityMatrix
    )
    similarities = index[query]

    progressBar.progress(88, text = "Finishing touches ...")
    df['Similarity Score (-1 to 1)'] = similarities
    # Dense ranking: tied scores share a rank and no rank numbers are skipped.
    df['Rank'] = df['Similarity Score (-1 to 1)'].rank(ascending = False, method = 'dense').astype(int)
    df.sort_values(by = 'Rank', inplace = True)

    # Format the elapsed wall-clock time as "H h : M m : S.ss s".
    elapsedSeconds = time.time() - startTime
    hours, remainder = divmod(int(elapsedSeconds), 3600)
    minutes, _ = divmod(remainder, 60)
    secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
    elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'

    progressBar.progress(100, text = 'Ranking Complete!')
    # Leave the completed bar visible briefly before removing it.
    time.sleep(1)
    progressBar.empty()
    st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
    return df
def _showGuideImage(imagePath):
    """Display one walkthrough screenshot and release its file handle afterwards."""
    with Image.open(imagePath) as guideImage:
        st.image(guideImage, use_column_width = True, output_format = "PNG")


def writeGettingStarted():
    """Render the "getting started" page: welcome text, input guide and demo walkthrough.

    Everything is written straight to the Streamlit page; nothing is returned.
    The walkthrough screenshots are read from 'clf-1.png' .. 'clf-4.png' and
    'rnk-1.png' .. 'rnk-4.png' relative to the working directory.
    """
    st.write("""
    ## Hello, Welcome!
    In today's competitive job market, the process of manually screening resumes has become a daunting task for recruiters and hiring managers.
    The sheer volume of applications received for a single job posting can make it extremely time-consuming to identify the most suitable candidates efficiently.
    This often leads to missed opportunities and the potential loss of top-tier talent.
    The ***Resume Screening & Classification*** website application aims to help alleviate the challenges posed by manual resume screening.
    The main objectives are:
    - To classify the resumes into their most suitable job industry category
    - To compare the resumes to the job description and rank them by similarity
    """)
    st.divider()
    st.write("""
    ## Input Guide
    #### For the Job Description:
    Ensure the job description is saved in a text (.txt) file.
    Kindly outline the responsibilities, qualifications, and skills associated with the position.
    #### For the Resumes:
    Resumes must be compiled in an excel (.xlsx) file.
    The organization of columns is up to you but ensure that the "Resume" column is present.
    The values under this column should include all the relevant details for each resume.
    """)
    st.info("""
    ##### NOTE:
    - If the "Resume" column is not present, the classification/ranking process will not be executed.
    - If there are multiple "Resume" columns, the first occurrence will be taken into account while the remaining duplicates are given a different column name.
    """)
    st.divider()
    st.write("""
    ## Demo Walkthrough
    #### Classify Tab:
    The web app will classify the resumes into their most suitable job industry category.
    Currently the Category Scope consists of the following:
    """)
    column1, column2 = st.columns(2)
    with column1:
        st.write("""
        - Aviation
        - Business development
        - Culinary
        - Education
        - Engineering
        - Finance
        """)
    with column2:
        st.write("""
        - Fitness
        - Healthcare
        - HR
        - Information Technology
        - Public relations
        """)
    with st.expander('Classification Steps'):
        st.write("""
        ##### Upload Resumes & Start Processing:
        - Navigate to the "Classify" tab.
        - Upload the Excel file (.xlsx) containing the resumes you want to classify. Ensure that your Excel file has the "Resume" column containing the resume texts.
        - Click the "Start Processing" button.
        - The app will analyze the resumes and categorize them into job industry categories.
        ######
        """)
        _showGuideImage('clf-1.png')
        st.write("""
        ##### View Bar Chart:
        - A bar chart will appear, showing the number of resumes per category, helping you visualize the distribution.
        ######
        """)
        _showGuideImage('clf-2.png')
        st.write("""
        ##### Add Filters:
        - You can apply filters to the dataframe to narrow down your results.
        ######
        """)
        _showGuideImage('clf-3.png')
        st.write("""
        ##### Download Results:
        - Once you've applied filters or are satisfied with the results, you can download the current dataframe as an Excel file by clicking the "Save Current Output as XLSX" button.
        ####
        """)
        _showGuideImage('clf-4.png')
    st.write("""
    #### Rank Tab:
    The web app will rank the resumes based on their semantic similarity to the job description.
    The similarity score ranges from -1 to 1.
    A score of 1 is achieved when Document A and Document B are identical.
    ##### **Kindly take note:**
    It's important to note that these scores are not absolute and may change when more resumes are added in the comparison.
    The ranking algorithm dynamically adjusts its results based on the entire set of uploaded resumes.
    We recommend considering the scores as a relative measure rather than an absolute determination.
    """)
    with st.expander('Ranking Steps'):
        st.write("""
        ##### Upload Files & Start Processing:
        - Navigate to the "Rank" tab.
        - Upload the job description as a text file. This file should contain the description of the job you want to compare resumes against.
        - Upload the Excel file that contains the resumes you want to rank.
        - Click the "Start Processing" button.
        - The app will analyze the job description and rank the resumes based on their semantic similarity to the job description.
        ######
        """)
        _showGuideImage('rnk-1.png')
        st.write("""
        ##### View Job Description:
        - The output will display the contents of the job description for reference.
        ######
        """)
        _showGuideImage('rnk-2.png')
        st.write("""
        ##### Add Filters:
        - You can apply filters to the dataframe to narrow down your results.
        ######
        """)
        _showGuideImage('rnk-3.png')
        st.write("""
        ##### Download Results:
        - Once you've applied filters or are satisfied with the results, you can download the current dataframe as an Excel file by clicking the "Save Current Output as XLSX" button.
        ####
        """)
        _showGuideImage('rnk-4.png')