# JobShield-AI / matching_engine.py
# shravanijadhav264's picture
# Fix jobs.csv path
# 0de3125
import pandas as pd
import numpy as np
import os
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------- MODEL ----------------
# Sentence-embedding model shared by the scoring and matching code below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# ---------------- DATA ----------------
# dataset/jobs.csv is resolved relative to this file, not the working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
jobs_path = os.path.join(BASE_DIR, "dataset", "jobs.csv")
jobs_df = pd.read_csv(jobs_path)


def _first_three_words(raw_title):
    """Return at most the first three whitespace-separated words of a title."""
    words = raw_title.split()
    return " ".join(words[:3])


# CLEAN TITLES
# Force every title to text, shorten it to three words, then normalise
# whitespace and trim the ends.
jobs_df["title"] = (
    jobs_df["title"]
    .astype(str)
    .apply(_first_three_words)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
# ---------------- CLEAN TEXT ----------------
def clean(text):
    """Lower-case *text* and blank out every character outside the allowed set.

    The argument is first coerced with ``str()``. Any character that is not an
    ASCII letter, a digit, ``+``, ``.``, ``#`` or a space is replaced by one
    space. Runs of spaces are not collapsed and the ends are not trimmed.
    """
    lowered = str(text).lower()
    return re.sub(r"[^a-zA-Z0-9+.# ]", " ", lowered)
# ---------------- SKILL DATABASE ----------------
# Known skills, all lower-case. extract_skills() reports matches in this order.
SKILL_DB = [
    "python", "java", "c++", "sql", "machine learning", "deep learning",
    "flask", "django", "html", "css", "javascript", "react", "node",
    "pandas", "numpy", "tensorflow", "keras", "nlp", "api", "git", "mongodb",
    "mysql", "scikit-learn", "bootstrap", "rest api", "fastapi",
    "postgresql",
]


def _skill_regex(skill):
    """Build a regex source string that matches *skill* as a standalone term."""
    # The words of a multi-word skill may be separated by any whitespace, so
    # "machine\nlearning" (a line break inside the phrase) still counts.
    body = r"\s+".join(re.escape(word) for word in skill.split())
    # A match must not touch another letter or digit on either side; this is
    # what stops "java" from firing inside "javascript" or "api" inside "rapid".
    return r"(?<![a-z0-9])" + body + r"(?![a-z0-9])"


def extract_skills(text):
    """Return the skills from ``SKILL_DB`` that occur in *text*.

    Matching is case-insensitive and term-based: a skill only counts when it
    is not embedded in a longer alphanumeric run. Punctuation next to the
    skill ("python,", "node.js", "sql.") does not prevent a match.

    Args:
        text: Text to scan. It is coerced with ``str()``, like ``clean()``.

    Returns:
        A list of matching skills, without duplicates, in ``SKILL_DB`` order.
    """
    lowered = str(text).lower()
    # dict.fromkeys() drops duplicate SKILL_DB entries while keeping order.
    # Patterns are rebuilt per call so that edits to SKILL_DB are picked up.
    return [
        skill
        for skill in dict.fromkeys(SKILL_DB)
        if re.search(_skill_regex(skill), lowered)
    ]
# ---------------- SKILL SCORE ----------------
def skill_score(resume, jd):
    """Return the percentage of the job description's skills found in the resume.

    Skills are detected in both texts with ``extract_skills``. The result is
    ``0`` when the job description mentions no known skill; otherwise it is
    the share of job-description skills also present in the resume, scaled
    to 0-100.
    """
    found_in_resume = set(extract_skills(resume))
    wanted = extract_skills(jd)

    # Nothing to match against: avoid dividing by zero.
    if not wanted:
        return 0

    overlap = found_in_resume.intersection(wanted)
    return (len(overlap) / len(wanted)) * 100
# ---------------- KEYWORD SCORE ----------------
# Words too generic to count as a keyword match.
_KEYWORD_STOPWORDS = frozenset({
    "the", "and", "for", "with", "this", "that", "you",
    "our", "your", "are", "have", "has", "will", "from",
    "job", "work", "team", "using", "required",
})


def _keyword_set(text):
    """Return the distinct keyword tokens of *text* after cleaning."""
    keywords = set()
    for token in clean(text).split():
        # clean() preserves ".", so a word ending a sentence arrives here as
        # "python." and would never equal "python". Trim trailing periods only;
        # inner or leading dots ("node.js", ".net") are left intact.
        word = token.rstrip(".")
        if len(word) > 2 and word not in _KEYWORD_STOPWORDS:
            keywords.add(word)
    return keywords


def keyword_score(resume, jd):
    """Return the percentage of job-description keywords present in the resume.

    Both texts are normalised with ``clean()`` and split on whitespace.
    Tokens of two characters or fewer and a small set of stopwords are
    ignored, and trailing periods are stripped so that sentence punctuation
    does not hide a match.

    Args:
        resume: Resume text.
        jd: Job-description text.

    Returns:
        ``0`` when the job description yields no keywords; otherwise the share
        of distinct job-description keywords found in the resume, as 0-100.
    """
    resume_words = _keyword_set(resume)
    jd_words = _keyword_set(jd)

    if not jd_words:
        return 0

    match = len(resume_words & jd_words)
    return (match / len(jd_words)) * 100
# ---------------- SEMANTIC SCORE ----------------
def semantic_score(resume, jd):
    """Return the embedding similarity of *resume* and *jd* on a 0-100 scale.

    Both texts are encoded with the module-level ``model``. Their cosine
    similarity is mapped linearly from [-1, 1] to [0, 100].

    Returns:
        A plain Python ``float`` between 0.0 and 100.0.
    """
    resume_vec, jd_vec = model.encode([resume, jd])

    # float() turns the NumPy scalar into a built-in float so that callers can
    # round and serialise the score without NumPy-specific handling.
    sim = float(cosine_similarity([resume_vec], [jd_vec])[0][0])

    # Floating-point error can push a cosine marginally outside [-1, 1];
    # clamp it so the score cannot leave the 0-100 range.
    sim = min(1.0, max(-1.0, sim))

    return (sim + 1.0) / 2.0 * 100.0
# ---------------- FINAL SCORE ----------------
def calculate_similarity(resume, jd):
    """Return the overall resume-to-job match score, rounded to two decimals.

    The score blends three component scores: 35% ``semantic_score``,
    40% ``skill_score`` and 25% ``keyword_score``. A low skill score then
    scales the blend down: by 0.75 when it is below 10, by 0.9 when it is
    below 25.

    Returns:
        A plain Python ``float``.
    """
    sem = semantic_score(resume, jd)
    skill = skill_score(resume, jd)
    keyword = keyword_score(resume, jd)

    final = (0.35 * sem) + (0.4 * skill) + (0.25 * keyword)

    # IMPORTANT PENALTY: weak skill overlap drags the whole score down.
    if skill < 10:
        final *= 0.75
    elif skill < 25:
        final *= 0.9

    # The semantic component may be a NumPy scalar, which would make the blend
    # one too. Convert before rounding so the result is a built-in float.
    return round(float(final), 2)
# ---------------- JOB VECTORS ----------------
# Embed every cleaned job title once at import time; get_top_job_matches()
# compares resumes against these vectors.
_job_titles = jobs_df["title"].tolist()
job_vectors = model.encode(_job_titles, show_progress_bar=True)
# ---------------- JOB MATCHING ----------------
def get_top_job_matches(resume_text, top_n=5):
    """Return the job titles whose embeddings are closest to the resume.

    Args:
        resume_text: Resume text to embed with the module-level ``model``.
        top_n: Maximum number of matches to return. Defaults to 5.

    Returns:
        A list of ``{"title": str, "match": float}`` dicts ordered from best
        to worst, where ``match`` is the cosine similarity times 100, rounded
        to two decimals. The list is empty when there are no job vectors or
        ``top_n`` is not positive.
    """
    # cosine_similarity() rejects an empty matrix, so bail out early.
    if top_n <= 0 or len(job_vectors) == 0:
        return []

    resume_vector = model.encode([resume_text])
    scores = cosine_similarity(resume_vector, job_vectors)[0]

    # Indices of the highest scores, best first.
    top_indices = np.argsort(scores)[::-1][:top_n]

    titles = jobs_df["title"]
    # NOTE(review): titles are cut to three words, so different postings can
    # show the same title here; confirm whether duplicates should be merged.
    return [
        {
            "title": " ".join(str(titles.iloc[i]).split()[:3]),
            "match": round(float(scores[i]) * 100, 2),
        }
        for i in top_indices
    ]