Spaces: Runtime error
Commit 3cdb53b · Upload 2 files
Parent(s): f9848af
Browse files
app.py CHANGED

@@ -59,6 +59,7 @@ with tab2:
         st.divider()
         st.header('Output')
         resumeClf = pd.read_excel(uploadedResumeClf)
+
         if 'Resume' in resumeClf.columns:
             resumeClf = classifyResumes(resumeClf)
             with st.expander('View Bar Chart'):
@@ -98,6 +99,7 @@ with tab3:
         st.header('Output')
         jobDescriptionRnk = uploadedJobDescriptionRnk.read().decode('utf-8')
         resumeRnk = pd.read_excel(uploadedResumeRnk)
+
        if 'Resume' in resumeRnk.columns:
             resumeRnk = rankResumes(jobDescriptionRnk, resumeRnk)
             with st.expander('View Job Description'):
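Both hunks only add a separating blank line, but they sit inside the same guard pattern: read the upload into a DataFrame, then run the cached pipeline only when the required "Resume" column exists. A minimal sketch of that flow, assuming the app imports its helper from this Space's utils.py; the uploader label and the st.error fallback are illustrative, not shown in this diff:

import pandas as pd
import streamlit as st

from utils import classifyResumes  # assumed: helper defined in this Space's utils.py

uploadedResumeClf = st.file_uploader('Upload resumes', type = 'xlsx')

if uploadedResumeClf is not None:
    st.divider()
    st.header('Output')
    resumeClf = pd.read_excel(uploadedResumeClf)

    # Gate the expensive pipeline behind the column check from the diff.
    if 'Resume' in resumeClf.columns:
        resumeClf = classifyResumes(resumeClf)
        st.dataframe(resumeClf)
    else:
        st.error('The "Resume" column is missing from the uploaded file.')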
utils.py CHANGED

@@ -40,7 +40,6 @@ def addZeroFeatures(matrix):
 
 @st.cache_data(max_entries = 1, show_spinner = False)
 def classifyResumes(df):
-    # WITH PROGRESS BAR
     progressBar = st.progress(0)
     progressBar.progress(0, text = "Preprocessing data ...")
     startTime = time.time()
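With the alternative implementation gone (next hunk), the "# WITH PROGRESS BAR" label no longer disambiguates anything, so it is dropped. The surrounding pattern is worth noting: show_spinner = False suppresses the default cache spinner because the function draws its own progress bar, and max_entries = 1 caches only the latest result. A condensed sketch of that pattern; the function name, stage percentages, and sleep are illustrative stand-ins, while the divmod timer mirrors the code below the fold:

import datetime
import time

import streamlit as st

@st.cache_data(max_entries = 1, show_spinner = False)
def processWithProgress(df):
    progressBar = st.progress(0)
    progressBar.progress(0, text = "Preprocessing data ...")
    startTime = time.time()

    # ... preprocess, extract features, predict ...
    progressBar.progress(50, text = "Predicting categories ...")
    time.sleep(0.5)  # stand-in for the real work
    progressBar.progress(100, text = "Done")
    progressBar.empty()

    # Same hours/minutes/seconds formatting the app uses.
    elapsedTime = datetime.timedelta(seconds = time.time() - startTime)
    hours, remainder = divmod(elapsedTime.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    st.info(f'Finished in {hours} hr {minutes} min {seconds} sec')
    return df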
@@ -72,29 +71,6 @@ def classifyResumes(df):
     st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
     return df
 
-    # NO LOADING WIDGET
-    # startTime = time.time()
-    # df['cleanedResume'] = df.Resume.apply(lambda x: performStemming(x))
-    # resumeText = df['cleanedResume'].values
-    # vectorizer = loadTfidfVectorizer()
-    # wordFeatures = vectorizer.transform(resumeText)
-    # wordFeaturesWithZeros = addZeroFeatures(wordFeatures)
-    # finalFeatures = dimensionalityReduction(wordFeaturesWithZeros)
-    # knn = loadKnnModel()
-    # predictedCategories = knn.predict(finalFeatures)
-    # le = loadLabelEncoder()
-    # df['Industry Category'] = le.inverse_transform(predictedCategories)
-    # df['Industry Category'] = pd.Categorical(df['Industry Category'])
-    # df.drop(columns = ['cleanedResume'], inplace = True)
-    # endTime = time.time()
-    # elapsedSeconds = endTime - startTime
-    # elapsedTime = datetime.timedelta(seconds = elapsedSeconds)
-    # hours, remainder = divmod(elapsedTime.seconds, 3600)
-    # minutes, seconds = divmod(remainder, 60)
-    # elapsedTimeStr = f"{hours} hr {minutes} min {seconds} sec"
-    # st.info(f'Finished in {elapsedTimeStr}')
-    # return df
-
 def clickClassify():
     st.session_state.processClf = True
 
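The deleted comments were a spinner-free duplicate of the live classification pipeline: stem the resumes, transform them with a saved TF-IDF vectorizer, pad and reduce the feature matrix, predict with a saved KNN model, and decode the labels. A rough standalone equivalent in plain scikit-learn; it fits a fresh vectorizer, encoder, and model in place of the app's pickled loadTfidfVectorizer / loadKnnModel / loadLabelEncoder helpers, skips the addZeroFeatures / dimensionalityReduction steps, and uses toy data, so it is illustrative only:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

# Toy training data standing in for the app's pre-trained artifacts.
trainDf = pd.DataFrame({
    'Resume': ['java developer spring hibernate', 'nurse patient care hospital'],
    'Industry Category': ['IT', 'Healthcare'],
})

vectorizer = TfidfVectorizer()                          # loadTfidfVectorizer() in the app
features = vectorizer.fit_transform(trainDf['Resume'])
le = LabelEncoder()                                     # loadLabelEncoder() in the app
labels = le.fit_transform(trainDf['Industry Category'])
knn = KNeighborsClassifier(n_neighbors = 1).fit(features, labels)  # loadKnnModel() in the app

def classify(df):
    wordFeatures = vectorizer.transform(df['Resume'])
    predictedCategories = knn.predict(wordFeatures)
    df['Industry Category'] = pd.Categorical(le.inverse_transform(predictedCategories))
    return df

print(classify(pd.DataFrame({'Resume': ['java spring developer resume']})))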
@@ -283,7 +259,6 @@ model = loadModel()
 
 @st.cache_data(max_entries = 1, show_spinner = False)
 def rankResumes(text, df):
-    # WITH PROGRESS BAR
     progressBar = st.progress(0)
     progressBar.progress(0, text = "Preprocessing data ...")
     startTime = time.time()
@@ -326,156 +301,6 @@ def rankResumes(text, df):
     st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
     return df
 
-    # NO LOADING WIDGET
-    # startTime = time.time()
-    # jobDescriptionText = performLemmatization(text)
-    # df['cleanedResume'] = df['Resume'].apply(lambda x: performLemmatization(x))
-    # documents = [jobDescriptionText] + df['cleanedResume'].tolist()
-    # dictionary = Dictionary(documents)
-    # tfidf = TfidfModel(dictionary = dictionary)
-    # similarityIndex = WordEmbeddingSimilarityIndex(model)
-    # similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
-    # query = tfidf[dictionary.doc2bow(jobDescriptionText)]
-    # index = SoftCosineSimilarity(
-    #     tfidf[[dictionary.doc2bow(resume) for resume in df['cleanedResume']]],
-    #     similarityMatrix
-    # )
-    # similarities = index[query]
-    # df['Similarity Score'] = similarities
-    # df.sort_values(by = 'Similarity Score', ascending = False, inplace = True)
-    # df.drop(columns = ['cleanedResume'], inplace = True)
-    # endTime = time.time()
-    # elapsedSeconds = endTime - startTime
-    # elapsedTime = datetime.timedelta(seconds = elapsedSeconds)
-    # hours, remainder = divmod(elapsedTime.seconds, 3600)
-    # minutes, seconds = divmod(remainder, 60)
-    # elapsedTimeStr = f"{hours} hr {minutes} min {seconds} sec"
-    # st.info(f'Finished in {elapsedTimeStr}')
-    # return df
-
-    # TF-IDF + LSA + COSSIM
-    # from sklearn.decomposition import TruncatedSVD
-    # import math
-    # def resumesRank(jobDescriptionRnk, resumeRnk):
-    #     jobDescriptionRnk = preprocessing(jobDescriptionRnk)
-    #     resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing(x))
-    #     resumes = resumeRnk['cleanedResume'].values
-    #     # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words = 'english')
-    #     # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True)
-    #     # tfidfVectorizer = TfidfVectorizer(stop_words = 'english')
-    #     tfidfVectorizer = TfidfVectorizer()
-    #     tfidfMatrix = tfidfVectorizer.fit_transform([jobDescriptionRnk] + list(resumes))
-    #     num_features = len(tfidfVectorizer.get_feature_names_out())
-    #     st.write(f"Number of TF-IDF Features: {num_features}")
-    #     nComponents = math.ceil(len(resumes) * 0.55)
-    #     # nComponents = math.ceil(num_features * 0.01)
-    #     # nComponents = 5
-    #     st.write(nComponents)
-    #     # nComponents = len(resumes)
-    #     lsa = TruncatedSVD(n_components=nComponents)
-    #     lsaMatrix = lsa.fit_transform(tfidfMatrix)
-    #     similarityScores = cosine_similarity(lsaMatrix[0:1], lsaMatrix[1:])
-    #     resumeRnk['Similarity Score (%)'] = similarityScores[0] * 100
-    #     resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
-    #     del resumeRnk['cleanedResume']
-    #     return resumeRnk
-
-    # 1 BY 1 SOFT COSSIM
-    # def resumesRank(jobDescriptionRnk, resumeRnk):
-    #     jobDescriptionText = preprocessing2(jobDescriptionRnk)
-    #     resumeRnk['cleanedResume'] = resumeRnk['Resume'].apply(lambda x: preprocessing2(x))
-    #     similarityscore = []
-    #     for resume in resumeRnk['cleanedResume']:
-    #         documents = [jobDescriptionText, resume]
-    #         dictionary = Dictionary(documents)
-    #         documentBow = [dictionary.doc2bow(doc) for doc in documents]
-    #         tfidf = TfidfModel(documentBow, dictionary=dictionary)
-    #         similarityIndex = WordEmbeddingSimilarityIndex(model)
-    #         similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
-    #         # similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary)
-    #         value = tfidf[dictionary.doc2bow(resume)]
-    #         # value = dictionary.doc2bow(jobDescriptionText)
-    #         index = SoftCosineSimilarity(
-    #             # tfidf[[dictionary.doc2bow(resume)]],
-    #             tfidf[[dictionary.doc2bow(jobDescriptionText)]],
-    #             # [dictionary.doc2bow(resume) for resume in resumeRnk['cleanedResume']],
-    #             similarityMatrix,
-    #         )
-    #         similarities = index[value]
-    #         similarityscore.append(similarities)
-    #     print(similarityscore)
-    #     resumeRnk['Similarity Score'] = similarityscore
-    #     resumeRnk.sort_values(by='Similarity Score', ascending=False, inplace=True)
-    #     resumeRnk.drop(columns=['cleanedResume'], inplace=True)
-    #     return resumeRnk
-    #
-    # TF-IDF SCORE + WORD EMBEDDINGS SCORE
-    # def resumesRank(jobDescriptionRnk, resumeRnk):
-    #     def get_word_embedding(text):
-    #         words = text.split()
-    #         valid_words = [word for word in text.split() if word in model]
-    #         if valid_words:
-    #             return np.mean([model[word] for word in valid_words], axis=0)
-    #         else:
-    #             return np.zeros(model.vector_size)
-    #     jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
-    #     resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
-    #     tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words='english')
-    #     jobTfidf = tfidfVectorizer.fit_transform([jobDescriptionRnk])
-    #     jobDescriptionEmbedding = get_word_embedding(jobDescriptionRnk)
-    #     resumeSimilarities = []
-    #     for resumeContent in resumeRnk['cleanedResume']:
-    #         resumeEmbedding = get_word_embedding(resumeContent)
-    #         similarityFastText = cosine_similarity([jobDescriptionEmbedding], [resumeEmbedding])[0][0]
-    #         similarityTFIDF = cosine_similarity(jobTfidf, tfidfVectorizer.transform([resumeContent]))[0][0]
-    #         similarity = (0.6 * similarityTFIDF) + (0.4 * similarityFastText)
-    #         final_similarity = similarity * 100
-    #         resumeSimilarities.append(final_similarity)
-    #     resumeRnk['Similarity Score (%)'] = resumeSimilarities
-    #     resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
-    #     del resumeRnk['cleanedResume']
-    #     return resumeRnk
-
-    # WORD EMBEDDINGS + COSSIM
-    # def resumesRank(jobDescriptionRnk, resumeRnk):
-    #     def get_word_embedding(text):
-    #         words = text.split()
-    #         valid_words = [word for word in text.split() if word in model]
-    #         if valid_words:
-    #             return np.mean([model[word] for word in valid_words], axis=0)
-    #         else:
-    #             return np.zeros(model.vector_size)
-    #     jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
-    #     jobDescriptionEmbedding = get_word_embedding(jobDescriptionRnk)
-    #     resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
-    #     resumeSimilarities = []
-    #     for resumeContent in resumeRnk['cleanedResume']:
-    #         resumeEmbedding = get_word_embedding(resumeContent)
-    #         similarity = cosine_similarity([jobDescriptionEmbedding], [resumeEmbedding])[0][0]
-    #         percentageSimilarity = similarity * 100
-    #         resumeSimilarities.append(percentageSimilarity)
-    #     resumeRnk['Similarity Score (%)'] = resumeSimilarities
-    #     resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
-    #     del resumeRnk['cleanedResume']
-    #     return resumeRnk
-
-    # TF-IDF + COSSIM
-    # def resumesRank(jobDescriptionRnk, resumeRnk):
-    #     jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
-    #     resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
-    #     tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words='english')
-    #     jobTfidf = tfidfVectorizer.fit_transform([jobDescriptionRnk])
-    #     resumeSimilarities = []
-    #     for resumeContent in resumeRnk['cleanedResume']:
-    #         resumeTfidf = tfidfVectorizer.transform([resumeContent])
-    #         similarity = cosine_similarity(jobTfidf, resumeTfidf)
-    #         percentageSimilarity = (similarity[0][0] * 100)
-    #         resumeSimilarities.append(percentageSimilarity)
-    #     resumeRnk['Similarity Score (%)'] = resumeSimilarities
-    #     resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
-    #     del resumeRnk['cleanedResume']
-    #     return resumeRnk
-
 def writeGettingStarted():
     st.write("""
 ## Hello, Welcome!
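The first deleted block above mirrors the soft-cosine pipeline that rankResumes keeps (minus the progress bar); the later blocks are older alternatives that were tried and abandoned (TF-IDF + LSA, per-resume soft cosine, averaged word embeddings, plain TF-IDF cosine). A condensed, runnable version of the kept approach; the small GloVe model and pre-tokenized toy documents are stand-ins for the app's loadModel() and real, lemmatized resumes:

import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import (
    SoftCosineSimilarity,
    SparseTermSimilarityMatrix,
    WordEmbeddingSimilarityIndex,
)

# Stand-in for the app's loadModel(); downloads ~66 MB on first use.
model = api.load('glove-wiki-gigaword-50')

jobDescription = ['python', 'machine', 'learning', 'engineer']
resumes = [
    ['python', 'data', 'scientist'],
    ['chef', 'restaurant', 'kitchen'],
]

# Build the shared vocabulary and a TF-IDF weighting over it.
documents = [jobDescription] + resumes
dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary = dictionary)

# Term-to-term similarities from the word embeddings, sparsified.
similarityIndex = WordEmbeddingSimilarityIndex(model)
similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)

# Score every resume against the job description in one query.
query = tfidf[dictionary.doc2bow(jobDescription)]
index = SoftCosineSimilarity(
    tfidf[[dictionary.doc2bow(resume) for resume in resumes]],
    similarityMatrix,
)
print(index[query])  # one similarity score per resume

Unlike the deleted "1 BY 1 SOFT COSSIM" variant, this builds the dictionary and similarity matrix once and scores all resumes in a single indexed query, which is why it was the version kept.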
@@ -500,6 +325,11 @@ def writeGettingStarted():
     The organization of columns is up to you but ensure that the "Resume" column is present.
     The values under this column should include all the relevant details for each resume.
     """)
+    st.info("""
+    ##### NOTE:
+    - If the "Resume" column is not present, the classification/ranking process will not be executed.
+    - If there are multiple "Resume" columns, the first occurrence will be taken into account while the remaining duplicates are given a different column name.
+    """)
     st.divider()
     st.write("""
 ## Demo Walkthrough
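The added NOTE matches pandas' behavior when a sheet has repeated headers: on read, only the first "Resume" column keeps its name and later duplicates are renamed (e.g. to "Resume.1"), so the app's column check always targets the first occurrence. A quick illustration with read_csv, which deduplicates the same way read_excel does; the toy data is made up:

import io

import pandas as pd

# Two columns share the "Resume" header; pandas renames the duplicate on read.
data = io.StringIO('Resume,Resume,Score\nresume text A,duplicate text,42\n')
df = pd.read_csv(data)

print(df.columns.tolist())     # ['Resume', 'Resume.1', 'Score']
print('Resume' in df.columns)  # True - the first occurrence keeps its name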