Spaces:

midlajvalappil
/

Resume-Ranker-AI

Sleeping

App Files Files Community

Resume-Ranker-AI / src /utils /nlp_analyzer.py

midlajvalappil

Upload 8 files

35fa114 verified 10 months ago

raw

history blame contribute delete

7.67 kB

	import spacy
	import re
	from collections import Counter
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import subprocess
	import sys

	class NLPAnalyzer:
	"""Analyze resume and job description using NLP techniques."""

	def __init__(self):
	# Load spaCy model with fallback
	self.nlp = self._load_spacy_model()

	# Common words to exclude from keyword analysis
	if self.nlp:
	self.stop_words = self.nlp.Defaults.stop_words.union({
	"experience", "year", "years", "skill", "skills", "job",
	"work", "working", "candidate", "ability", "position"
	})
	else:
	# Fallback stop words if spaCy model fails to load
	self.stop_words = {
	"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
	"experience", "year", "years", "skill", "skills", "job", "work", "working", "candidate", "ability", "position"
	}

	def _load_spacy_model(self):
	"""Load spaCy model with fallback options."""
	models_to_try = ["en_core_web_md", "en_core_web_sm", "en_core_web_lg"]

	for model_name in models_to_try:
	try:
	nlp = spacy.load(model_name)
	print(f"Successfully loaded spaCy model: {model_name}")
	return nlp
	except OSError:
	print(f"Model {model_name} not found, trying next...")
	continue

	# If no models are available, try to download en_core_web_sm
	try:
	print("No spaCy models found. Attempting to download en_core_web_sm...")
	subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
	nlp = spacy.load("en_core_web_sm")
	print("Successfully downloaded and loaded en_core_web_sm")
	return nlp
	except Exception as e:
	print(f"Failed to download spaCy model: {e}")
	print("Running in fallback mode without spaCy NLP features")
	return None

	def preprocess_text(self, text):
	"""Clean and preprocess text."""
	if not text:
	return ""

	# Convert to lowercase and remove special characters
	text = re.sub(r'[^\w\s]', ' ', text.lower())
	# Remove extra whitespace
	text = re.sub(r'\s+', ' ', text).strip()
	return text

	def extract_keywords(self, text, max_keywords=30):
	"""Extract important keywords from text using spaCy or fallback method."""
	if self.nlp:
	# Use spaCy for advanced keyword extraction
	doc = self.nlp(self.preprocess_text(text))

	# Extract nouns, proper nouns, and adjectives that aren't stop words
	keywords = []
	for token in doc:
	if (token.pos_ in ["NOUN", "PROPN"] or
	(token.pos_ == "ADJ" and len(token.text) > 2)) and \
	not token.is_stop and token.text.lower() not in self.stop_words:
	keywords.append(token.text.lower())

	# Count frequencies and return top keywords
	keyword_freq = Counter(keywords)
	return [word for word, freq in keyword_freq.most_common(max_keywords)]
	else:
	# Fallback method without spaCy
	return self._extract_keywords_fallback(text, max_keywords)

	def _extract_keywords_fallback(self, text, max_keywords=30):
	"""Fallback keyword extraction without spaCy."""
	processed_text = self.preprocess_text(text)
	words = processed_text.split()

	# Filter out stop words and short words
	keywords = [word for word in words
	if len(word) > 2 and word.lower() not in self.stop_words]

	# Count frequencies and return top keywords
	keyword_freq = Counter(keywords)
	return [word for word, freq in keyword_freq.most_common(max_keywords)]

	def extract_skills(self, text):
	"""Extract skills from text using a combination of NER and keyword extraction."""
	skills = []

	if self.nlp:
	# Use spaCy for advanced skill extraction
	doc = self.nlp(self.preprocess_text(text))

	# Extract entities that might be skills
	for ent in doc.ents:
	if ent.label_ in ["ORG", "PRODUCT", "GPE"]:
	skills.append(ent.text.lower())

	# Add noun chunks that might represent skills
	for chunk in doc.noun_chunks:
	if not any(token.is_stop for token in chunk) and len(chunk.text) > 3:
	skills.append(chunk.text.lower())

	# Add technical terms and programming languages (works with or without spaCy)
	tech_pattern = r'\b(?:python\|java\|javascript\|html\|css\|sql\|react\|node\.js\|aws\|docker\|kubernetes\|machine learning\|ai\|data science\|nlp\|c\+\+\|ruby\|php\|swift\|golang\|scala\|r\|tableau\|power bi\|excel\|word\|powerpoint\|photoshop\|illustrator)\b'
	tech_skills = re.findall(tech_pattern, text.lower())
	skills.extend(tech_skills)

	# If no spaCy, add some basic skill extraction
	if not self.nlp:
	# Extract capitalized words that might be technologies/skills
	cap_words = re.findall(r'\b[A-Z][a-zA-Z]+\b', text)
	skills.extend([word.lower() for word in cap_words if len(word) > 2])

	# Remove duplicates and return
	return list(set(skills))

	def calculate_match_score(self, resume_text, job_text):
	"""Calculate match percentage between resume and job description."""
	if not resume_text or not job_text:
	return 0

	# Preprocess texts
	resume_processed = self.preprocess_text(resume_text)
	job_processed = self.preprocess_text(job_text)

	# Use TF-IDF vectorization for better semantic matching
	vectorizer = TfidfVectorizer(stop_words='english')
	try:
	tfidf_matrix = vectorizer.fit_transform([resume_processed, job_processed])
	# Calculate cosine similarity
	similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

	# Convert to percentage
	match_percentage = round(similarity * 100)

	# Ensure the score is between 0 and 100
	return max(0, min(match_percentage, 100))
	except:
	# Fallback if vectorization fails
	return self._calculate_keyword_match(resume_text, job_text)

	def _calculate_keyword_match(self, resume_text, job_text):
	"""Fallback method to calculate match based on keyword overlap."""
	job_keywords = set(self.extract_keywords(job_text, max_keywords=50))
	resume_keywords = set(self.extract_keywords(resume_text, max_keywords=100))

	if not job_keywords:
	return 0

	# Calculate overlap
	matching_keywords = resume_keywords.intersection(job_keywords)
	match_percentage = round((len(matching_keywords) / len(job_keywords)) * 100)

	return max(0, min(match_percentage, 100))

	def find_missing_keywords(self, resume_text, job_text):
	"""Identify keywords from job description missing in resume."""
	job_keywords = set(self.extract_keywords(job_text, max_keywords=50))
	resume_keywords = set(self.extract_keywords(resume_text, max_keywords=100))

	missing_keywords = job_keywords - resume_keywords
	return list(missing_keywords)