Spaces:

Novricana25
/

AIResumeOptimiser

Sleeping

App Files Files Community

AIResumeOptimiser / ats_system.py

Novricana25

Upload 3 files

6adeb94 verified 5 months ago

raw

history blame contribute delete

6.56 kB


	import re
	import nltk
	from nltk.corpus import stopwords
	from sentence_transformers import SentenceTransformer
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	# Make sure the NLTK English stop-word corpus can be loaded; if the lookup
	# fails because the corpus is not installed, download it.
	try:
	    stopwords.words('english')
	except LookupError:
	    nltk.download('stopwords')

	# 1. Preprocessing Function
	def preprocess_text(text, stop_words=None):
	    """Normalise free text into a space-separated string of lowercase words.

	    The text is lowercased, digits are deleted, every remaining run of
	    non-letter characters is treated as a word separator, and stop words
	    are filtered out.

	    Args:
	        text: Raw input string.
	        stop_words: Optional collection of lowercase words to discard.
	            When omitted, NLTK's English stop-word list is used.

	    Returns:
	        The remaining words joined by single spaces ("" if none remain).
	    """
	    text = text.lower()
	    # Digits are removed outright, so "python3" becomes "python".
	    text = re.sub(r'\d+', '', text)
	    # Punctuation and symbols become a space instead of vanishing, so that
	    # "python/sql" or "machine-learning" stay two words rather than fusing
	    # into a single meaningless token.
	    text = re.sub(r'[^a-z]+', ' ', text)

	    if stop_words is None:
	        stop_words = set(stopwords.words('english'))

	    # str.split() with no argument also discards leading/trailing/extra spaces.
	    return ' '.join(word for word in text.split() if word not in stop_words)

	# 2. Sentence-embedding model and embedding function
	# The SentenceTransformer is constructed once, at module import, so that
	# get_embeddings() reuses the same instance on every call.
	model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	def get_embeddings(texts):
	    """Encode ``texts`` with the module-level sentence-transformer ``model``.

	    The progress bar is turned off; the value produced by ``model.encode``
	    is returned as-is.
	    """
	    encoded = model.encode(texts, show_progress_bar=False)
	    return encoded

	# 3. Keyword Extraction Setup (Vectorizers)
	# Module-level TfidfVectorizer and CountVectorizer instances, created once so
	# they can be shared and reused.
	tfidf_vectorizer = TfidfVectorizer()
	count_vectorizer = CountVectorizer()

	# NOTE: both vectorizers are created unfitted, and none of the visible code
	# fits them. Ideally they would be fitted on a large corpus and then
	# saved/loaded; for this demo the caller is expected to fit them (e.g. on the
	# combined sample data) before handing them to the analysis code.

	# 4. Keyword Extraction Functions
	def extract_keywords_tfidf(tfidf_vectorizer_obj, tfidf_matrix, top_n=5):
	    """Return the highest-weighted terms of each document in a TF-IDF matrix.

	    Args:
	        tfidf_vectorizer_obj: Fitted vectorizer exposing
	            ``get_feature_names_out()``; its feature order must match the
	            columns of ``tfidf_matrix``.
	        tfidf_matrix: 2-D document-term matrix (scipy sparse matrix or
	            NumPy array), one row per document.
	        top_n: Maximum number of keywords per document. Values <= 0 yield
	            no keywords.

	    Returns:
	        A list with one entry per row: that document's terms ordered from
	        highest to lowest weight, skipping terms whose weight is not
	        positive.
	    """
	    feature_names = tfidf_vectorizer_obj.get_feature_names_out()
	    n_docs = tfidf_matrix.shape[0]

	    # ``row.argsort()[-0:]`` would select *every* column, so a non-positive
	    # top_n has to be handled explicitly instead of falling through.
	    if top_n <= 0:
	        return [[] for _ in range(n_docs)]

	    keywords_list = []
	    for i in range(n_docs):
	        row = tfidf_matrix[i, :]
	        # Sparse rows need densifying; dense rows are used as they are.
	        row = row.toarray() if hasattr(row, "toarray") else np.asarray(row)
	        row = row.ravel()
	        # argsort is ascending: take the last top_n indices, best first.
	        top_n_indices = row.argsort()[-top_n:][::-1]
	        keywords_list.append(
	            [feature_names[idx] for idx in top_n_indices if row[idx] > 0]
	        )
	    return keywords_list

	def extract_keywords_countvectorizer(count_vectorizer_obj, count_matrix, top_n=5):
	    """Return the most frequent terms of each document in a count matrix.

	    Args:
	        count_vectorizer_obj: Fitted vectorizer exposing
	            ``get_feature_names_out()``; its feature order must match the
	            columns of ``count_matrix``.
	        count_matrix: 2-D document-term count matrix (scipy sparse matrix
	            or NumPy array), one row per document.
	        top_n: Maximum number of keywords per document. Values <= 0 yield
	            no keywords.

	    Returns:
	        A list with one entry per row: that document's terms ordered from
	        highest to lowest count, skipping terms that do not occur.
	    """
	    feature_names = count_vectorizer_obj.get_feature_names_out()
	    n_docs = count_matrix.shape[0]

	    # A slice of ``[-0:]`` keeps the whole array, which would return every
	    # occurring term; treat a non-positive top_n as "no keywords" instead.
	    if top_n <= 0:
	        return [[] for _ in range(n_docs)]

	    keywords_list = []
	    for i in range(n_docs):
	        row = count_matrix[i, :]
	        # Sparse rows need densifying; dense rows are used as they are.
	        row = row.toarray() if hasattr(row, "toarray") else np.asarray(row)
	        row = row.ravel()
	        # argsort is ascending: take the last top_n indices, best first.
	        top_n_indices = row.argsort()[-top_n:][::-1]
	        keywords_list.append(
	            [feature_names[idx] for idx in top_n_indices if row[idx] > 0]
	        )
	    return keywords_list

	# 5. Similarity Scoring Function
	def calculate_cosine_similarity(embeddings1, embeddings2):
	    """Compute the cosine-similarity matrix between two sets of embeddings.

	    Both arguments are converted with ``np.array`` and passed to
	    ``cosine_similarity``; its result is returned unchanged.
	    """
	    left, right = np.array(embeddings1), np.array(embeddings2)
	    return cosine_similarity(left, right)

	# 6. Missing Keywords Identification
	def identify_missing_keywords(resume_keywords_list, job_desc_keywords_list):
	    """List the job-description keywords that do not appear in the resume.

	    Args:
	        resume_keywords_list: Iterable of keywords found in the resume.
	        job_desc_keywords_list: Iterable of job-description keywords.

	    Returns:
	        A list of the keywords from ``job_desc_keywords_list`` that are absent
	        from ``resume_keywords_list``, without duplicates and in the order
	        they first appear in ``job_desc_keywords_list``.
	    """
	    resume_set = set(resume_keywords_list)
	    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
	    # difference would return the keywords in an arbitrary order that can
	    # change between runs and loses the job description's ranking.
	    return list(
	        dict.fromkeys(
	            keyword
	            for keyword in job_desc_keywords_list
	            if keyword not in resume_set
	        )
	    )

	# 7. Feedback Generation
	def generate_feedback(similarity_score, missing_keywords):
	    """Build the human-readable feedback text for a resume/job comparison.

	    The message consists of a score line (two decimals), a verdict chosen by
	    threshold (>= 0.7, >= 0.5, otherwise), a paragraph that either lists the
	    missing keywords or states that none were found, and a closing reminder.
	    """
	    sections = [
	        f"Overall Resume-Job Description Match Score: {similarity_score:.2f} (out of 1.00)\n\n"
	    ]

	    # Verdict paragraph, selected by score band.
	    if similarity_score >= 0.7:
	        sections.append(
	            "Excellent match! Your resume aligns very well with the job description. "
	            "Consider fine-tuning minor details for an even stronger application.\n"
	        )
	    elif similarity_score >= 0.5:
	        sections.append(
	            "Good match. Your resume shows strong relevance to the job. "
	            "Focus on strengthening specific areas for better alignment.\n"
	        )
	    else:
	        sections.append(
	            "Moderate match. There's potential for improvement to better align "
	            "your resume with the job description. Consider enhancing key sections.\n"
	        )

	    # Keyword paragraph.
	    if not missing_keywords:
	        sections.append(
	            "\nNo significant missing keywords identified from the job description. "
	            "Your resume covers key terms effectively.\n"
	        )
	    else:
	        sections.append(
	            "\nTo improve your match, consider incorporating the following keywords, "
	            "which were prominent in the job description but not explicitly found in your resume: "
	        )
	        sections.append(", ".join(missing_keywords) + ".\n")

	    sections.append(
	        "\nRemember to tailor your resume's experience and skills sections to highlight these areas."
	    )
	    return "".join(sections)

	# 8. Main ATS Analysis Report Function
	def get_ats_analysis_report(resume_text_raw, job_desc_text_raw, tfidf_vectorizer_fit, count_vectorizer_fit):
	    """Compare a resume with a job description and build an ATS-style report.

	    Args:
	        resume_text_raw: Raw resume text.
	        job_desc_text_raw: Raw job-description text.
	        tfidf_vectorizer_fit: Already-fitted TF-IDF vectorizer; only its
	            ``transform`` and ``get_feature_names_out`` methods are used.
	        count_vectorizer_fit: Accepted for interface compatibility; this
	            function does not use it.

	    Returns:
	        A dict with the original and cleaned texts, the top TF-IDF keywords of
	        each text, the semantic similarity score (as a float), the job
	        description keywords missing from the resume, and the feedback text.
	    """
	    # a. Preprocess texts
	    cleaned_resume = preprocess_text(resume_text_raw)
	    cleaned_job_desc = preprocess_text(job_desc_text_raw)

	    # b. Semantic embeddings (one single-item batch per text)
	    resume_embedding = get_embeddings([cleaned_resume])
	    job_desc_embedding = get_embeddings([cleaned_job_desc])

	    # c. The similarity matrix is 1x1 here; take its only entry
	    similarity_score = calculate_cosine_similarity(resume_embedding, job_desc_embedding)[0][0]

	    # d. TF-IDF vectors from the already-fitted vectorizer
	    resume_tfidf_single = tfidf_vectorizer_fit.transform([cleaned_resume])
	    job_desc_tfidf_single = tfidf_vectorizer_fit.transform([cleaned_job_desc])

	    # e. Top keywords of each single-document matrix
	    resume_keywords_single = extract_keywords_tfidf(tfidf_vectorizer_fit, resume_tfidf_single)[0]
	    job_desc_keywords_single = extract_keywords_tfidf(tfidf_vectorizer_fit, job_desc_tfidf_single)[0]

	    # f. A job-description keyword is only "missing" when the resume does not
	    #    contain it at all. Comparing against just the resume's top keywords
	    #    would wrongly flag terms that are present in the resume but not
	    #    among its highest-weighted ones, so compare against every vocabulary
	    #    term that has a non-zero weight in the resume vector.
	    feature_names = tfidf_vectorizer_fit.get_feature_names_out()
	    resume_terms = [feature_names[idx] for idx in resume_tfidf_single.nonzero()[1]]
	    missing_keywords = identify_missing_keywords(resume_terms, job_desc_keywords_single)

	    # g. Feedback text
	    feedback = generate_feedback(similarity_score, missing_keywords)

	    # h. Assemble the report
	    report = {
	        "original_resume_text": resume_text_raw,
	        "original_job_description_text": job_desc_text_raw,
	        "cleaned_resume_text": cleaned_resume,
	        "cleaned_job_description_text": cleaned_job_desc,
	        "resume_keywords_tfidf": resume_keywords_single,
	        "job_description_keywords_tfidf": job_desc_keywords_single,
	        "semantic_similarity_score": float(similarity_score),
	        "missing_keywords": missing_keywords,
	        "feedback": feedback,
	    }
	    return report

	# NOTE(review): this executes every time the module is imported, not only
	# when the file is first written; it looks like a leftover from generating
	# the file. Confirm whether it is still wanted.
	print("ats_system.py created successfully.")