|
|
import os |
|
|
import fitz |
|
|
import docx |
|
|
import numpy as np |
|
|
import gradio as gr |
|
|
import re |
|
|
from sentence_transformers import SentenceTransformer, CrossEncoder |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bi-encoder: fast single-vector sentence embeddings, used for the cheap
# first-pass similarity between job description and every CV.
bi_encoder = SentenceTransformer("BAAI/bge-base-en")


# Cross-encoder: slower joint (query, document) relevance scorer, used to
# re-rank only the bi-encoder's shortlist (see rank_cvs).
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_text(file_path):
    """Return the plain text of a PDF or DOCX file.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text, or "" for unsupported extensions.
    """
    # Lowercase the path so ".PDF" / ".Docx" uploads are not silently
    # dropped by the case-sensitive endswith checks.
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        # join() over all pages instead of quadratic += string concatenation.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    if lowered.endswith(".docx"):
        d = docx.Document(file_path)
        return "\n".join(p.text for p in d.paragraphs)

    # Unknown format: callers treat "" as "skip this file".
    return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_text(t):
    """Lowercase *t* and collapse every whitespace run to one space."""
    return re.sub(r"\s+", " ", t.lower())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def embed_chunks(text, size=400):
    """Embed fixed-size character chunks of *text* with the bi-encoder
    and mean-pool them into a single document vector.

    Args:
        text: Document text to embed.
        size: Chunk length in characters (default 400).

    Returns:
        A 1-D numpy array: the mean of all chunk embeddings.
    """
    pieces = []
    for start in range(0, len(text), size):
        pieces.append(text[start:start + size])
    vectors = bi_encoder.encode(pieces)
    return np.mean(vectors, axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Keyword vocabulary for the naive skill-overlap score (see skill_score).
# All entries are lowercase because both the job description and CV text
# have been lowercased by clean_text before matching.
SKILLS = [

    "python","java","sql","aws","docker","kubernetes",

    "machine learning","pytorch","tensorflow",

    "react","node","linux"

]
|
|
|
|
|
def skill_score(job, cv):
    """Fraction of the job's required skills that appear in the CV.

    Args:
        job: Cleaned (lowercased) job-description text.
        cv: Cleaned (lowercased) CV text.

    Returns:
        matched / required in [0, 1]; 0 when the job names no known skills.
    """
    def _mentions(skill, text):
        # Whole-word match: the original plain substring test produced
        # false positives such as "java" matching "javascript" and
        # "react" matching "reaction".
        return re.search(r"\b" + re.escape(skill) + r"\b", text) is not None

    job_skills = [s for s in SKILLS if _mentions(s, job)]
    if not job_skills:
        return 0
    matched = sum(_mentions(s, cv) for s in job_skills)
    return matched / len(job_skills)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_years(text):
    """Return the largest "<N> years" / "<N>+ years" figure in *text*.

    Returns 0 when no such phrase is present.
    """
    best = 0
    for match in re.finditer(r"(\d+)\+?\s+years?", text):
        best = max(best, int(match.group(1)))
    return best
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rank_cvs(job_description, files):
    """Rank uploaded CV files against a job description.

    Pipeline: bi-encoder similarity shortlists the top 20 CVs, the
    cross-encoder re-scores them, and the final score blends semantic
    relevance, skill overlap, and years of experience.

    Args:
        job_description: Free-text job description from the UI.
        files: List of local file paths provided by the Gradio File input.

    Returns:
        A Markdown report of the top 10 candidates, or a short message
        when there is nothing to rank.
    """
    if not files:
        return "Upload CVs."

    job_description = clean_text(job_description)
    if not job_description.strip():
        # Guard: embedding an empty string would mean-pool zero chunks
        # (encode([]) -> empty array -> NaN mean).
        return "Provide a job description."

    job_emb = embed_chunks(job_description)

    candidates = []
    for f in files:
        name = os.path.basename(f)
        text = clean_text(extract_text(f))
        if not text:
            # Unsupported format or empty document — skip it.
            continue

        emb = embed_chunks(text)
        sim = cosine_similarity([job_emb], [emb])[0][0]
        candidates.append({
            "name": name,
            "text": text,
            "sim": sim
        })

    if not candidates:
        # Every upload was unreadable; cross_encoder.predict([]) would fail.
        return "No readable CVs."

    # Cheap bi-encoder similarity prunes to the 20 best before the
    # expensive cross-encoder pass.
    candidates = sorted(candidates, key=lambda x: x["sim"], reverse=True)[:20]

    # Truncate CV text so cross-encoder inputs stay a manageable size.
    pairs = [[job_description, c["text"][:3000]] for c in candidates]
    ce_scores = cross_encoder.predict(pairs)

    for c, ce in zip(candidates, ce_scores):
        c["ce"] = ce

    for c in candidates:
        # Compute once and cache on the candidate so the report loop
        # below doesn't redo the skill/years extraction.
        c["skill"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])

        c["final"] = (
            0.5 * c["ce"] +
            0.3 * c["skill"] +
            0.2 * min(c["years"] / 10, 1)  # cap experience credit at 10 years
        )

    candidates = sorted(candidates, key=lambda x: x["final"], reverse=True)

    output = ""
    for i, c in enumerate(candidates[:10]):
        output += (
            f"### {i+1}. {c['name']}\n"
            f"- Final Score: {c['final']:.3f}\n"
            f"- Semantic: {c['ce']:.3f}\n"
            f"- Skill Match: {c['skill']:.2f}\n"
            f"- Years: {c['years']}\n\n"
        )

    return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: a job-description text box plus a multi-file CV upload on the
# input side, and the Markdown ranking report from rank_cvs on the output.
demo = gr.Interface(

    fn=rank_cvs,

    inputs=[

        gr.Textbox(label="Job Description", lines=6),

        # type="filepath" hands rank_cvs local paths (strings), not file objects.
        gr.File(file_count="multiple", type="filepath")

    ],

    outputs=gr.Markdown(),

    title="Production CV Ranker"

)

# Launch the web server only when run as a script, not when imported.
if __name__ == "__main__":

    demo.launch()
|
|
|