# chatbot / app.py
# (Hugging Face Space page header, kept as a comment so the file parses:
#  "DevNumb's picture — Update app.py — commit 362982f verified")
import os
import fitz
import docx
import numpy as np
import gradio as gr
import re
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
# -----------------------
# MODELS (better choices)
# -----------------------
# Bi-encoder: embeds job description and CVs independently; used for the
# fast first-pass retrieval stage.
bi_encoder = SentenceTransformer("BAAI/bge-base-en") # better embeddings
# Cross-encoder: scores (job, CV) text pairs jointly; slower but more
# accurate, used only to rerank the shortlist.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
# -----------------------
# TEXT EXTRACTION
# -----------------------
def extract_text(file_path):
    """Extract raw text from a PDF or DOCX file.

    Extension matching is case-insensitive (handles e.g. "CV.PDF").
    Returns "" for any unsupported extension.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        # Collect per-page text and join once (avoids quadratic +=).
        parts = []
        with fitz.open(file_path) as doc:
            for page in doc:
                parts.append(page.get_text())
        return "".join(parts)
    if ext == ".docx":
        d = docx.Document(file_path)
        return "\n".join(p.text for p in d.paragraphs)
    return ""
# -----------------------
# CLEANING
# -----------------------
def clean_text(t):
    """Normalize text: lowercase and collapse all whitespace runs to single spaces."""
    return re.sub(r"\s+", " ", t.lower())
# -----------------------
# CHUNK EMBEDDINGS (IMPORTANT)
# -----------------------
def embed_chunks(text, size=400):
    """Embed *text* by mean-pooling bi-encoder embeddings of fixed-size chunks.

    Args:
        text: input string (already cleaned/lowercased by the caller).
        size: character length of each chunk.

    Returns:
        A single 1-D numpy vector (mean over chunk embeddings).

    Fix: for empty *text* the chunk list would be empty and
    np.mean(..., axis=0) would fail / yield NaN, so fall back to
    embedding the (empty) string itself.
    """
    chunks = [text[i:i + size] for i in range(0, len(text), size)] or [text]
    embs = bi_encoder.encode(chunks)
    return np.mean(embs, axis=0)
# -----------------------
# SKILL MATCHING
# -----------------------
# Closed vocabulary of skills we look for in job descriptions and CVs.
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux",
]


def _has_skill(skill, text):
    """True if *skill* occurs in *text* as a whole word/phrase.

    Word boundaries prevent false positives such as "java" matching
    "javascript" or "node" matching "nodejs" (the old substring test
    suffered from exactly that bug).
    """
    return re.search(r"\b" + re.escape(skill) + r"\b", text) is not None


def skill_score(job, cv):
    """Fraction (0.0-1.0) of the job's required skills found in the CV.

    Returns 0 when the job description mentions none of the known SKILLS.
    Both inputs are expected to be pre-lowercased by clean_text().
    """
    job_skills = [s for s in SKILLS if _has_skill(s, job)]
    if not job_skills:
        return 0
    matched = sum(_has_skill(s, cv) for s in job_skills)
    return matched / len(job_skills)
# -----------------------
# EXPERIENCE EXTRACTION (simple rule)
# -----------------------
def extract_years(text):
    """Return the largest "N years" / "N+ years" figure found in *text* (0 if none)."""
    best = 0
    for m in re.finditer(r"(\d+)\+?\s+years?", text):
        best = max(best, int(m.group(1)))
    return best
# -----------------------
# MAIN RANKING
# -----------------------
def rank_cvs(job_description, files):
    """Rank uploaded CV files against a job description.

    Pipeline:
      1. Fast retrieval: bi-encoder cosine similarity, shortlist top 20.
      2. Rerank: cross-encoder scores each (job, CV) pair jointly.
      3. Business score: 0.5*semantic + 0.3*skills + 0.2*experience.

    Args:
        job_description: free-text job description.
        files: list of file paths (from the Gradio File component).

    Returns:
        Markdown string describing the top 10 candidates.
    """
    if not files:
        return "Upload CVs."

    job_description = clean_text(job_description)
    job_emb = embed_chunks(job_description)  # embed the job once

    # ---- Stage 1: fast retrieval ----
    candidates = []
    for f in files:
        name = os.path.basename(f)
        text = clean_text(extract_text(f))
        if not text:  # unreadable / unsupported file: skip it
            continue
        emb = embed_chunks(text)
        sim = cosine_similarity([job_emb], [emb])[0][0]
        candidates.append({"name": name, "text": text, "sim": sim})

    if not candidates:
        # Guard: without this, cross_encoder.predict([]) runs on an empty
        # shortlist and the user gets blank output with no explanation.
        return "No readable text found in the uploaded files."

    # Shortlist top 20 before the expensive cross-encoder pass.
    candidates = sorted(candidates, key=lambda c: c["sim"], reverse=True)[:20]

    # ---- Stage 2: cross-encoder rerank (accuracy boost) ----
    # CV text truncated to 3000 chars to keep pair scoring fast.
    pairs = [[job_description, c["text"][:3000]] for c in candidates]
    for c, ce in zip(candidates, cross_encoder.predict(pairs)):
        c["ce"] = ce

    # ---- Stage 3: business-logic scoring ----
    for c in candidates:
        c["skills"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])
        c["final"] = (
            0.5 * c["ce"]                    # semantic accuracy
            + 0.3 * c["skills"]              # skill coverage
            + 0.2 * min(c["years"] / 10, 1)  # experience, capped at 10 years
        )

    candidates.sort(key=lambda c: c["final"], reverse=True)

    # ---- Explainable output (top 10) ----
    # Reuse the scores cached in stage 3 instead of recomputing
    # skill_score / extract_years per candidate as the original did.
    sections = []
    for i, c in enumerate(candidates[:10]):
        sections.append(
            f"### {i+1}. {c['name']}\n"
            f"- Final Score: {c['final']:.3f}\n"
            f"- Semantic: {c['ce']:.3f}\n"
            f"- Skill Match: {c['skills']:.2f}\n"
            f"- Years: {c['years']}\n\n"
        )
    return "".join(sections)
# -----------------------
# UI
# -----------------------
# Gradio front-end: a job-description textbox plus a multi-file CV upload;
# rank_cvs returns Markdown that is rendered as the ranked result list.
demo = gr.Interface(
fn=rank_cvs,
inputs=[
gr.Textbox(label="Job Description", lines=6),
gr.File(file_count="multiple", type="filepath")
],
outputs=gr.Markdown(),
title="Production CV Ranker"
)
# Launch the Gradio app only when executed as a script, not on import.
# (The launch call must be indented under the guard; the scraped source
# had lost that indentation, which is a SyntaxError.)
if __name__ == "__main__":
    demo.launch()