Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import os | |
| import re | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# ---------------- MODEL ----------------
# Sentence-embedding model, loaded once at import time and reused by the
# scoring and job-matching code in this module.
model = SentenceTransformer('all-MiniLM-L6-v2')

# ---------------- DATA ----------------
# Resolve the CSV relative to this file so loading does not depend on the
# current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
jobs_path = os.path.join(BASE_DIR, "dataset", "jobs.csv")
jobs_df = pd.read_csv(jobs_path)

# CLEAN TITLES
# Coerce every title to str, keep only its first three whitespace-separated
# words, then collapse whitespace runs and trim both ends.
jobs_df["title"] = (
    jobs_df["title"]
    .astype(str)
    .apply(lambda raw_title: " ".join(raw_title.split()[:3]))
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
# ---------------- CLEAN TEXT ----------------
def clean(text):
    """Lowercase *text* and blank out unsupported characters.

    The input is coerced with ``str()`` first. Every character that is not
    an ASCII letter, a digit, ``+``, ``.``, ``#`` or a space is replaced by
    one space, character for character (runs are not collapsed), so tokens
    such as ``c++``, ``c#`` and ``node.js`` survive intact.
    """
    return re.sub(r"[^a-zA-Z0-9+.# ]", " ", str(text).lower())
# ---------------- SKILL DATABASE ----------------
SKILL_DB = [
    "python", "java", "c++", "sql", "machine learning", "deep learning",
    "flask", "django", "html", "css", "javascript", "react", "node",
    "pandas", "numpy", "tensorflow", "keras", "nlp", "api", "git", "mongodb",
    "mysql", "scikit-learn", "bootstrap", "rest api", "fastapi",
    "postgresql",
]

# One pre-compiled pattern per skill, in SKILL_DB order.
# A plain substring test reports "java" for "javascript", "sql" for "mysql",
# "git" for "digital" and "api" for "rapid". Each pattern therefore requires
# that the skill is not glued to a preceding letter/digit and is not followed
# by a letter, digit, "+" or "#". A single trailing "s" is tolerated so that
# plurals such as "REST APIs" still count.
_SKILL_PATTERNS = [
    (
        skill,
        re.compile(r"(?<![a-z0-9])" + re.escape(skill) + r"s?(?![a-z0-9+#])"),
    )
    for skill in SKILL_DB
]


def extract_skills(text):
    """Return the SKILL_DB entries mentioned in *text*.

    Matching is case-insensitive and token-aware: a skill only counts when
    it appears as its own word or phrase, not as a fragment of a longer
    word. Fused spellings such as "nodejs" are not recognised as "node";
    they would need their own SKILL_DB entry.

    Args:
        text: Free text to scan. It is coerced with ``str()``, so ``None``
            or other non-string values yield an empty result rather than
            raising.

    Returns:
        A list of matching skills without duplicates, in SKILL_DB order.
    """
    lowered = str(text).lower()
    return [skill for skill, pattern in _SKILL_PATTERNS if pattern.search(lowered)]
# ---------------- SKILL SCORE ----------------
def skill_score(resume, jd):
    """Return the percentage of the job description's skills found in the resume.

    Skills are extracted from both texts with ``extract_skills``. When the
    job description yields no skills at all the score is 0; otherwise it is
    ``matched / required * 100``.
    """
    candidate_skills = set(extract_skills(resume))
    required_skills = set(extract_skills(jd))

    # Nothing to match against: report 0 rather than dividing by zero.
    if len(required_skills) == 0:
        return 0

    overlap = candidate_skills.intersection(required_skills)
    return (len(overlap) / len(required_skills)) * 100
# ---------------- KEYWORD SCORE ----------------
# Common filler words that are ignored when comparing vocabularies.
# Built once at import time instead of on every call.
_KEYWORD_STOPWORDS = frozenset({
    "the", "and", "for", "with", "this", "that", "you",
    "our", "your", "are", "have", "has", "will", "from",
    "job", "work", "team", "using", "required",
})


def _keyword_set(text):
    """Return the distinct scoring keywords found in *text*."""
    keywords = set()
    for token in clean(text).split():
        # clean() keeps "." so that tokens like "node.js" survive, which also
        # leaves sentence-final dots attached ("python."). Strip dots from the
        # token edges so "python." and "python" compare equal.
        token = token.strip(".")
        if len(token) > 2 and token not in _KEYWORD_STOPWORDS:
            keywords.add(token)
    return keywords


def keyword_score(resume, jd):
    """Return the percentage of job-description keywords present in the resume.

    Both texts are normalised with ``clean``, split on whitespace, stripped
    of leading/trailing dots, and filtered to tokens longer than two
    characters that are not stopwords.

    Args:
        resume: Resume text.
        jd: Job-description text.

    Returns:
        ``matched / len(jd keywords) * 100``, or 0 when the job description
        contains no usable keywords.
    """
    resume_words = _keyword_set(resume)
    jd_words = _keyword_set(jd)

    # Avoid dividing by zero when the job description has no keywords.
    if not jd_words:
        return 0

    return (len(resume_words & jd_words) / len(jd_words)) * 100
# ---------------- SEMANTIC SCORE ----------------
def semantic_score(resume, jd):
    """Return the embedding similarity of *resume* and *jd* on a 0-100 scale.

    Both texts are encoded with the module-level ``model`` and compared with
    cosine similarity. The similarity, which lies in [-1, 1], is mapped
    linearly onto [0, 100].

    Returns:
        A built-in ``float`` in [0.0, 100.0].
    """
    vectors = model.encode([resume, jd])
    # Convert the NumPy scalar to a built-in float: NumPy scalars would
    # otherwise propagate into the final score, and e.g. float32 values are
    # not JSON-serialisable.
    sim = float(cosine_similarity([vectors[0]], [vectors[1]])[0][0])
    # Clamp both ends: floating-point error can push the cosine marginally
    # outside [-1, 1].
    return min(100.0, max(0.0, (sim + 1) / 2 * 100))
# ---------------- FINAL SCORE ----------------
def calculate_similarity(resume, jd):
    """Return the overall resume/job-description match score.

    The score is a weighted blend of three component scores:
    35% semantic similarity, 40% skill overlap and 25% keyword overlap.
    A weak skill overlap is penalised further: the blend is multiplied by
    0.75 when the skill score is below 10, or by 0.9 when it is below 25.

    Args:
        resume: Resume text.
        jd: Job-description text.

    Returns:
        The final score as a built-in ``float`` rounded to two decimals.
    """
    sem = semantic_score(resume, jd)
    skill = skill_score(resume, jd)
    keyword = keyword_score(resume, jd)

    # Coerce to a built-in float so the result type does not depend on
    # whether a component score happens to be a NumPy scalar.
    final = float((0.35 * sem) + (0.4 * skill) + (0.25 * keyword))

    # IMPORTANT PENALTY
    # NOTE(review): skill_score is also 0 when the job description contains
    # no known skills, so such descriptions always get the 0.75 penalty.
    # Confirm that this is intended.
    if skill < 10:
        final *= 0.75
    elif skill < 25:
        final *= 0.9

    return round(final, 2)
# ---------------- JOB VECTORS ----------------
# Embed every cleaned job title once at import time; row i of job_vectors
# corresponds to row i of jobs_df.
job_vectors = model.encode(jobs_df["title"].tolist(), show_progress_bar=True)
# ---------------- JOB MATCHING ----------------
def get_top_job_matches(resume_text, top_n=5):
    """Return the job titles whose embeddings are closest to the resume.

    The resume is encoded with the module-level ``model`` and compared by
    cosine similarity against the precomputed ``job_vectors``.

    Args:
        resume_text: Resume text to match.
        top_n: Maximum number of distinct titles to return (default 5).

    Returns:
        A list of at most ``top_n`` dicts, best match first, each with
        ``"title"`` (at most three words) and ``"match"`` (cosine similarity
        times 100, rounded to two decimals, as a built-in float).
    """
    resume_vector = model.encode([resume_text])
    scores = cosine_similarity(resume_vector, job_vectors)[0]
    titles = jobs_df["title"]

    results = []
    seen_titles = set()
    # Walk the jobs from best to worst score.
    for i in np.argsort(scores)[::-1]:
        if len(results) >= top_n:
            break
        title = " ".join(str(titles.iloc[i]).split()[:3])
        # Job vectors are computed from the title text, so rows with the same
        # title have the same score. Listing them again adds no information
        # and would crowd out other matches.
        if title in seen_titles:
            continue
        seen_titles.add(title)
        results.append({
            "title": title,
            "match": round(float(scores[i]) * 100, 2),
        })
    return results