import pandas as pd import numpy as np import os import re from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity # ---------------- MODEL ---------------- model = SentenceTransformer('all-MiniLM-L6-v2') # ---------------- DATA ---------------- BASE_DIR = os.path.dirname(os.path.abspath(__file__)) jobs_path = os.path.join(BASE_DIR,"dataset", "jobs.csv") jobs_df = pd.read_csv(jobs_path) jobs_df["title"] = jobs_df["title"].astype(str) # CLEAN TITLES jobs_df["title"] = jobs_df["title"].apply(lambda x: " ".join(x.split()[:3])) jobs_df["title"] = jobs_df["title"].str.replace(r"\s+", " ", regex=True).str.strip() # ---------------- CLEAN TEXT ---------------- def clean(text): text = str(text).lower() text = re.sub(r"[^a-zA-Z0-9+.# ]", " ", text) return text # ---------------- SKILL DATABASE ---------------- SKILL_DB = [ "python","java","c++","sql","machine learning","deep learning", "flask","django","html","css","javascript","react","node", "pandas","numpy","tensorflow","keras","nlp","api","git","mongodb", "mysql","scikit-learn","bootstrap","rest api","fastapi", "postgresql" ] def extract_skills(text): text = text.lower() return list({skill for skill in SKILL_DB if skill in text}) # ---------------- SKILL SCORE ---------------- def skill_score(resume, jd): resume_skills = extract_skills(resume) jd_skills = extract_skills(jd) if not jd_skills: return 0 match = len(set(resume_skills) & set(jd_skills)) return (match / len(jd_skills)) * 100 # ---------------- KEYWORD SCORE ---------------- def keyword_score(resume, jd): stopwords = { "the","and","for","with","this","that","you", "our","your","are","have","has","will","from", "job","work","team","using","required" } resume_words = { w for w in clean(resume).split() if len(w) > 2 and w not in stopwords } jd_words = { w for w in clean(jd).split() if len(w) > 2 and w not in stopwords } if not jd_words: return 0 match = len(resume_words & jd_words) return (match / len(jd_words)) * 100 # ---------------- SEMANTIC SCORE ---------------- def semantic_score(resume, jd): vectors = model.encode([resume, jd]) sim = cosine_similarity([vectors[0]], [vectors[1]])[0][0] return max(0, (sim + 1) / 2 * 100) # ---------------- FINAL SCORE ---------------- def calculate_similarity(resume, jd): sem = semantic_score(resume, jd) skill = skill_score(resume, jd) keyword = keyword_score(resume, jd) final = (0.35 * sem) + (0.4 * skill) + (0.25 * keyword) # IMPORTANT PENALTY if skill < 10: final *= 0.75 elif skill < 25: final *= 0.9 return round(final, 2) # ---------------- JOB VECTORS ---------------- job_vectors = model.encode( jobs_df["title"].tolist(), show_progress_bar=True ) # ---------------- JOB MATCHING ---------------- def get_top_job_matches(resume_text): resume_vector = model.encode([resume_text]) scores = cosine_similarity(resume_vector, job_vectors)[0] top_indices = np.argsort(scores)[::-1][:5] results = [] for i in top_indices: title = str(jobs_df.iloc[i]["title"]) title = " ".join(title.split()[:3]) results.append({ "title": title, "match": round(float(scores[i]) * 100, 2) }) return results