# Spaces: DevNumb / chatbot (Running)
# File size: 4,333 Bytes

import os
import fitz
import docx
import numpy as np
import gradio as gr
import re
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------
# MODELS (better choices)
# -----------------------

# Both models are instantiated once, at import time, and shared by every call.
# Bi-encoder: produces the embeddings that embed_chunks() averages.
bi_encoder = SentenceTransformer("BAAI/bge-base-en")  # better embeddings
# Cross-encoder: scores (job description, CV text) pairs in rank_cvs().
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


# -----------------------
# TEXT EXTRACTION
# -----------------------

def extract_text(file_path):
    """Return the plain text of a PDF or DOCX file.

    The format is chosen from the file extension, compared
    case-insensitively so that ``CV.PDF`` is handled like ``cv.pdf``.

    Args:
        file_path: Path to the document, as a string or path-like object.

    Returns:
        The extracted text: the text of every PDF page concatenated in page
        order, or the DOCX paragraphs joined with newlines. An empty string
        is returned for any other extension.
    """
    lowered = os.fspath(file_path).lower()

    if lowered.endswith(".pdf"):
        with fitz.open(file_path) as doc:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)

    if lowered.endswith(".docx"):
        d = docx.Document(file_path)
        return "\n".join(p.text for p in d.paragraphs)

    # Unsupported format: callers treat empty text as "nothing to rank".
    return ""


# -----------------------
# CLEANING
# -----------------------

def clean_text(t):
    """Normalise text for matching: lower-case, single-spaced and trimmed.

    Args:
        t: Raw text.

    Returns:
        ``t`` lower-cased, with every run of whitespace collapsed to one
        space and leading/trailing whitespace removed. Whitespace-only
        input therefore yields ``""``, so an ``if not text`` check correctly
        recognises a document with no real content.
    """
    return re.sub(r"\s+", " ", t.lower()).strip()


# -----------------------
# CHUNK EMBEDDINGS (IMPORTANT)
# -----------------------

def embed_chunks(text, size=400):
    """Embed ``text`` as the mean of its fixed-size chunk embeddings.

    The text is cut into consecutive, non-overlapping slices of ``size``
    characters (the last slice may be shorter), each slice is encoded with
    the module-level ``bi_encoder``, and the encodings are averaged along
    axis 0.

    Args:
        text: Text to embed; must not be empty.
        size: Chunk length in characters; must be positive.

    Returns:
        The mean over axis 0 of ``bi_encoder.encode(chunks)``.

    Raises:
        ValueError: If ``size`` is not positive or ``text`` is empty. An
            empty chunk list has no meaningful mean, so fail here with a
            clear message instead of handing an undefined value to callers.
    """
    if size <= 0:
        raise ValueError("size must be a positive number of characters")
    if not text:
        raise ValueError("cannot embed empty text")

    chunks = [text[start:start + size] for start in range(0, len(text), size)]
    embs = bi_encoder.encode(chunks)
    return np.mean(embs, axis=0)


# -----------------------
# SKILL MATCHING
# -----------------------

# Skills recognised by skill_score() when no explicit list is passed.
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux",
]


def _mentions_skill(skill, text):
    """Return True if ``skill`` occurs in ``text`` as a whole word or phrase."""
    # Lookarounds rather than \b so the check also works for skills that
    # begin or end with a non-word character.
    pattern = r"(?<!\w)" + re.escape(skill) + r"(?!\w)"
    return re.search(pattern, text, flags=re.IGNORECASE) is not None


def skill_score(job, cv, skills=None):
    """Return the fraction of the job's required skills that the CV mentions.

    A skill counts only when it appears as a whole word or phrase, so
    "java" is not found inside "javascript" and "aws" is not found inside
    "draws". Matching is case-insensitive.

    Args:
        job: Job description text.
        cv: CV text.
        skills: Optional iterable of skill names to look for; defaults to
            the module-level ``SKILLS`` list.

    Returns:
        A float in [0, 1]: matched skills divided by the skills found in
        ``job``. Returns 0.0 when ``job`` mentions none of the skills.
    """
    known = SKILLS if skills is None else skills
    job_skills = [s for s in known if _mentions_skill(s, job)]
    if not job_skills:
        return 0.0
    matched = sum(_mentions_skill(s, cv) for s in job_skills)
    return matched / len(job_skills)


# -----------------------
# EXPERIENCE EXTRACTION (simple rule)
# -----------------------

def extract_years(text):
    """Return the largest "N years" figure mentioned in ``text``.

    Recognises forms such as "5 years", "10+ years", "1 year", "7 yrs" and
    "12years", case-insensitively. For a decimal figure the integer part is
    used ("2.5 years" counts as 2), so the fractional digits are never
    mistaken for the whole number.

    Args:
        text: Text to scan.

    Returns:
        The largest integer year count found, or 0 when there is none.
    """
    # (?<![\d.]) stops a match from starting in the middle of a number;
    # the trailing \b rejects words such as "yearly".
    pattern = r"(?<![\d.])(\d+)(?:\.\d+)?\+?\s*(?:years?|yrs?)\b"
    found = re.findall(pattern, text, flags=re.IGNORECASE)
    return max((int(n) for n in found), default=0)


# -----------------------
# MAIN RANKING
# -----------------------

_SHORTLIST_SIZE = 20        # CVs kept after the embedding-similarity pass
_REPORT_SIZE = 10           # CVs listed in the final report
_RERANK_CHAR_LIMIT = 3000   # leading characters of each CV given to the cross-encoder


def rank_cvs(job_description, files):
    """Rank uploaded CVs against a job description and return a Markdown report.

    Pipeline:
      1. Embed the job description and every readable CV with
         ``embed_chunks`` and keep the most similar CVs as a shortlist.
      2. Re-score the shortlist with the cross-encoder.
      3. Combine the cross-encoder score (weight 0.5), ``skill_score``
         (0.3) and years of experience capped at 10 (0.2) into a final
         score and sort by it.

    Args:
        job_description: Free-text job description.
        files: List of CV file paths; may be empty or None.

    Returns:
        A Markdown string listing the top CVs with their component scores,
        or a short message when there is nothing to rank (no files, blank
        job description, or no CV with extractable text).
    """
    if not files:
        return "Upload CVs."
    if not job_description or not job_description.strip():
        return "Enter a job description."

    job_description = clean_text(job_description).strip()

    # Embed the job once; it is compared against every CV.
    job_emb = embed_chunks(job_description)

    # ----------------
    # Stage 1: Fast retrieval
    # ----------------
    candidates = []
    for f in files:
        text = clean_text(extract_text(f)).strip()
        if not text:
            # Unsupported format or no extractable text: nothing to score.
            continue

        sim = cosine_similarity([job_emb], [embed_chunks(text)])[0][0]
        candidates.append({
            "name": os.path.basename(f),
            "text": text,
            "sim": sim,
        })

    if not candidates:
        return "No readable text found in the uploaded CVs."

    candidates = sorted(
        candidates, key=lambda c: c["sim"], reverse=True
    )[:_SHORTLIST_SIZE]

    # ----------------
    # Stage 2: Cross-encoder rerank (accuracy boost)
    # ----------------
    pairs = [[job_description, c["text"][:_RERANK_CHAR_LIMIT]] for c in candidates]
    logits = np.asarray(cross_encoder.predict(pairs), dtype=float)
    # NOTE: the ms-marco cross-encoder model card shows raw, unbounded
    # relevance logits. Squash them to (0, 1) so the 0.5/0.3/0.2 weights
    # below combine terms that share a scale.
    ce_scores = 1.0 / (1.0 + np.exp(-logits))

    # ----------------
    # Stage 3: Business logic scoring
    # ----------------
    for c, ce in zip(candidates, ce_scores):
        c["ce"] = float(ce)
        # Stored on the candidate so the report reuses them.
        c["skills"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])
        c["final"] = (
            0.5 * c["ce"] +                  # semantic accuracy
            0.3 * c["skills"] +              # skills
            0.2 * min(c["years"] / 10, 1)    # experience, capped at 10 years
        )

    candidates.sort(key=lambda c: c["final"], reverse=True)

    # ----------------
    # Explainable output
    # ----------------
    return "".join(
        f"### {rank}. {c['name']}\n"
        f"- Final Score: {c['final']:.3f}\n"
        f"- Semantic: {c['ce']:.3f}\n"
        f"- Skill Match: {c['skills']:.2f}\n"
        f"- Years: {c['years']}\n\n"
        for rank, c in enumerate(candidates[:_REPORT_SIZE], start=1)
    )


# -----------------------
# UI
# -----------------------

# Gradio front end: a job-description text box and a multi-file upload are
# passed, in that order, to rank_cvs(); its returned string is shown as Markdown.
demo = gr.Interface(
    fn=rank_cvs,
    inputs=[
        gr.Textbox(label="Job Description", lines=6),
        # rank_cvs() treats each uploaded item as a path
        # (os.path.basename / extract_text), hence type="filepath".
        gr.File(file_count="multiple", type="filepath")
    ],
    outputs=gr.Markdown(),
    title="Production CV Ranker"
)

# Launch the UI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()