"""CV ranking pipeline: bi-encoder retrieval, cross-encoder rerank, rule-based scoring."""

import os
import re

import docx
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------
# MODELS (better choices)
# -----------------------
bi_encoder = SentenceTransformer("BAAI/bge-base-en")  # better embeddings
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


# -----------------------
# TEXT EXTRACTION
# -----------------------
def extract_text(file_path):
    """Return the plain text of a .pdf or .docx file; "" for any other type.

    FIX: the extension check is now case-insensitive — ".PDF"/".DOCX"
    uploads were previously skipped silently.
    """
    lower = file_path.lower()
    if lower.endswith(".pdf"):
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    if lower.endswith(".docx"):
        d = docx.Document(file_path)
        return "\n".join(p.text for p in d.paragraphs)
    return ""


# -----------------------
# CLEANING
# -----------------------
def clean_text(t):
    """Lowercase `t` and collapse every whitespace run to a single space."""
    return re.sub(r"\s+", " ", t.lower())


# -----------------------
# CHUNK EMBEDDINGS (IMPORTANT)
# -----------------------
def embed_chunks(text, size=400):
    """Embed `text` in `size`-character chunks and mean-pool into one vector.

    FIX: empty text previously yielded np.mean over an empty array (NaN
    vector plus a RuntimeWarning); we now fall back to embedding the empty
    string once so the caller always gets a finite vector.
    """
    chunks = [text[i:i + size] for i in range(0, len(text), size)] or [text]
    embs = bi_encoder.encode(chunks)
    return np.mean(embs, axis=0)


# -----------------------
# SKILL MATCHING
# -----------------------
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux",
]

# FIX: plain substring tests matched "java" inside "javascript" and "react"
# inside "reaction". Pre-compiled word-boundary patterns avoid those false
# positives ("node" still matches in "node.js", since "." is not a word char).
_SKILL_PATTERNS = {
    s: re.compile(r"(?<!\w)" + re.escape(s) + r"(?!\w)") for s in SKILLS
}


def skill_score(job, cv):
    """Fraction of the job's required SKILLS found in the CV (0 if none required)."""
    job_skills = [s for s in SKILLS if _SKILL_PATTERNS[s].search(job)]
    if not job_skills:
        return 0
    matched = sum(bool(_SKILL_PATTERNS[s].search(cv)) for s in job_skills)
    return matched / len(job_skills)


# -----------------------
# EXPERIENCE EXTRACTION (simple rule)
# -----------------------
def extract_years(text):
    """Largest "N years" / "N+ years" figure mentioned in `text`, else 0."""
    # \s* (was \s+) also catches the common unspaced form "5years".
    nums = re.findall(r"(\d+)\+?\s*years?", text)
    return max((int(n) for n in nums), default=0)


# -----------------------
# MAIN RANKING
# -----------------------
def rank_cvs(job_description, files):
    """Rank uploaded CV files against a job description; return Markdown.

    Pipeline: (1) bi-encoder cosine similarity shortlists the top 20,
    (2) a cross-encoder reranks the shortlist, (3) skill coverage and
    years of experience are blended into a weighted final score.
    """
    if not files:
        return "Upload CVs."

    job_description = clean_text(job_description)
    job_emb = embed_chunks(job_description)  # embed the job once

    # ----------------
    # Stage 1: Fast retrieval
    # ----------------
    candidates = []
    for f in files:
        text = clean_text(extract_text(f))
        if not text:
            continue
        emb = embed_chunks(text)
        sim = cosine_similarity([job_emb], [emb])[0][0]
        candidates.append({"name": os.path.basename(f), "text": text, "sim": sim})

    # FIX: with zero readable CVs this previously fell through to
    # cross_encoder.predict([]) and crashed instead of answering the user.
    if not candidates:
        return "No readable text found in the uploaded files."

    # shortlist top 20
    candidates = sorted(candidates, key=lambda c: c["sim"], reverse=True)[:20]

    # ----------------
    # Stage 2: Cross-encoder rerank (accuracy boost)
    # ----------------
    pairs = [[job_description, c["text"][:3000]] for c in candidates]
    ce_scores = cross_encoder.predict(pairs)
    for c, ce in zip(candidates, ce_scores):
        # FIX: ms-marco cross-encoders emit unbounded logits; squashing with a
        # sigmoid makes the 0.5 weight below commensurate with the 0-1 skill
        # and experience terms (otherwise the logit dominated the blend).
        c["ce"] = float(1.0 / (1.0 + np.exp(-ce)))

    # ----------------
    # Stage 3: Business logic scoring (computed once, cached for the report)
    # ----------------
    for c in candidates:
        c["skills"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])
        c["final"] = (
            0.5 * c["ce"]                      # semantic accuracy
            + 0.3 * c["skills"]                # skills
            + 0.2 * min(c["years"] / 10, 1)    # experience, capped at 10 years
        )

    # ----------------
    # sort final
    # ----------------
    candidates.sort(key=lambda c: c["final"], reverse=True)

    # ----------------
    # Explainable output
    # ----------------
    output = ""
    for i, c in enumerate(candidates[:10]):
        output += (
            f"### {i + 1}. {c['name']}\n"
            f"- Final Score: {c['final']:.3f}\n"
            f"- Semantic: {c['ce']:.3f}\n"
            f"- Skill Match: {c['skills']:.2f}\n"
            f"- Years: {c['years']}\n\n"
        )
    return output


# -----------------------
# UI
# -----------------------
demo = gr.Interface(
    fn=rank_cvs,
    inputs=[
        gr.Textbox(label="Job Description", lines=6),
        gr.File(file_count="multiple", type="filepath"),
    ],
    outputs=gr.Markdown(),
    title="Production CV Ranker",
)

if __name__ == "__main__":
    demo.launch()