Spaces:

DevNumb
/

chatbot

Sleeping

App Files Files Community

DevNumb committed on Feb 6

Commit

362982f

verified ·

1 Parent(s): 265e52e

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -45

app.py CHANGED Viewed

@@ -1,74 +1,188 @@
 import os
-import gradio as gr
-import fitz  # PyMuPDF
 import docx
 import numpy as np
-from sentence_transformers import SentenceTransformer
-# 1. Load a pretrained Sentence Transformer model locally
-model = SentenceTransformer("all-MiniLM-L6-v2")
-# ---- Text extraction ----
-def extract_text(file_path, filename):
-    if filename.endswith(".pdf"):
         text = ""
         with fitz.open(file_path) as doc:
             for page in doc:
-                text += page.get_text("text") + "\n"
         return text
-    elif filename.endswith(".docx"):
-        docf = docx.Document(file_path)
-        return "\n".join(p.text for p in docf.paragraphs)
     return ""
-# ---- Local embedding helper ----
-def get_embedding(text):
-    # Use the locally loaded model to generate embeddings
-    embedding = model.encode(text)
-    return np.array(embedding)
-# ---- CV ranking ----
 def rank_cvs(job_description, files):
-    if not job_description or not files:
-        return "⚠️ Please upload CVs and provide a job description."
-    job_emb = get_embedding(job_description)
-    scores, names = [], []
-    for file_path in files:
-        # Get filename from file path
-        filename = os.path.basename(file_path)
-        text = extract_text(file_path, filename)
-        if not text.strip():
             continue
-        cv_emb = get_embedding(text[:4000])  # limit text length
-        sim = np.dot(job_emb, cv_emb) / (
-            np.linalg.norm(job_emb) * np.linalg.norm(cv_emb)
         )
-        scores.append(sim)
-        names.append(filename)
-    top = sorted(zip(names, scores), key=lambda x: x[1], reverse=True)[:10]
-    return "\n\n".join(
-        [f"**{i+1}. {n}** — Similarity: `{s:.3f}`" for i, (n, s) in enumerate(top)]
-    )
-# ---- Gradio UI ----
 demo = gr.Interface(
     fn=rank_cvs,
     inputs=[
-        gr.Textbox(label="💼 Job Description", lines=5),
-        gr.File(label="📁 Upload CVs (PDF/DOCX)", file_count="multiple", type="filepath"),
     ],
     outputs=gr.Markdown(),
-    title="📄 AI CV Ranker (Local Model)",
-    description="Ranks uploaded CVs based on job relevance using local SentenceTransformer model.",
 )
 if __name__ == "__main__":
-    demo.launch()

 import os
+import fitz
 import docx
 import numpy as np
+import gradio as gr
+import re
+from sentence_transformers import SentenceTransformer, CrossEncoder
+from sklearn.metrics.pairwise import cosine_similarity
+# -----------------------
+# MODELS (better choices)
+# -----------------------
# Bi-encoder: embed_chunks() uses it to turn text chunks into vectors.
# Both models are created at import time, so they are loaded once per process.
bi_encoder = SentenceTransformer("BAAI/bge-base-en")  # better embeddings
# Cross-encoder: rank_cvs() uses it to score (job description, CV text) pairs.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+# -----------------------
+# TEXT EXTRACTION
+# -----------------------
def extract_text(file_path):
    """Extract the raw text of a CV file.

    PDF files are read with PyMuPDF (``fitz``) and DOCX files with
    ``python-docx``. The file type is chosen from the extension, compared
    case-insensitively so that uploads such as ``CV.PDF`` are not silently
    ignored.

    Args:
        file_path: Path of the file on disk.

    Returns:
        The extracted text, or ``""`` when the extension is not supported.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        with fitz.open(file_path) as doc:
            # Join pages with a newline so the last word of one page cannot
            # fuse with the first word of the next one.
            return "\n".join(page.get_text() for page in doc)
    if lowered.endswith(".docx"):
        document = docx.Document(file_path)
        return "\n".join(p.text for p in document.paragraphs)
    return ""
+# -----------------------
+# CLEANING
+# -----------------------
def clean_text(t):
    """Normalise text for matching: lower-case it and collapse whitespace.

    Every run of whitespace becomes a single space and leading/trailing
    whitespace is removed, so a document that contains only whitespace
    normalises to ``""`` and can be detected with a plain truthiness check.

    Args:
        t: Text to normalise; ``None`` or ``""`` is accepted.

    Returns:
        The normalised text (``""`` for empty or ``None`` input).
    """
    if not t:
        return ""
    return re.sub(r"\s+", " ", t.lower()).strip()
+# -----------------------
+# CHUNK EMBEDDINGS (IMPORTANT)
+# -----------------------
def embed_chunks(text, size=400):
    """Embed *text* as the mean of the embeddings of its fixed-size chunks.

    The text is cut into consecutive slices of ``size`` characters, every
    slice is encoded with the module-level ``bi_encoder`` and the resulting
    vectors are averaged along axis 0.

    Args:
        text: Text to embed.
        size: Chunk length in characters; must be positive.

    Returns:
        The element-wise mean of the chunk embeddings.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    # An empty text would yield no chunks, and averaging an empty batch gives
    # NaN; fall back to encoding the (empty) text itself as a single chunk.
    chunks = [text[i:i + size] for i in range(0, len(text), size)] or [text]
    return np.mean(bi_encoder.encode(chunks), axis=0)
+# -----------------------
+# SKILL MATCHING
+# -----------------------
# Skills recognised by skill_score(); all entries are lower-case.
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux",
]


def _mentions_skill(skill, text):
    """Return True when *skill* occurs in *text* as a whole word or phrase."""
    # Multi-word skills tolerate any amount of whitespace between the words.
    phrase = r"\s+".join(re.escape(word) for word in skill.split())
    # The look-arounds reject matches embedded in a longer word, so "java"
    # does not match "javascript" and "aws" does not match "laws".
    return re.search(rf"(?<!\w){phrase}(?!\w)", text, re.IGNORECASE) is not None


def skill_score(job, cv):
    """Return the fraction of the job's required skills that the CV mentions.

    A skill from ``SKILLS`` is considered required when it appears in *job*,
    and matched when it also appears in *cv*. Matching is whole-word and
    case-insensitive.

    Args:
        job: Job description text.
        cv: CV text.

    Returns:
        A value between 0 and 1; ``0`` when the job mentions no known skill.
    """
    job_skills = [s for s in SKILLS if _mentions_skill(s, job)]
    if not job_skills:
        return 0
    matched = sum(_mentions_skill(s, cv) for s in job_skills)
    return matched / len(job_skills)
+# -----------------------
+# EXPERIENCE EXTRACTION (simple rule)
+# -----------------------
def extract_years(text):
    """Return the largest "N years" figure mentioned in *text*.

    Recognises forms such as ``5 years``, ``5+ years``, ``5years``, ``5 yrs``
    and ``1.5 years`` (case-insensitive). For decimal figures the whole-year
    part is used, so ``1.5 years`` counts as 1 rather than being misread as
    ``5 years``.

    Args:
        text: Text to scan.

    Returns:
        The largest number of years found as an ``int``, or ``0`` if none.
    """
    # (?<![\d.]) stops the match from starting in the middle of a number or
    # on the fractional digits of a decimal.
    found = re.findall(
        r"(?<![\d.])(\d+)(?:\.\d+)?\+?\s*(?:years?|yrs?)\b",
        text,
        re.IGNORECASE,
    )
    return max((int(n) for n in found), default=0)
+# -----------------------
+# MAIN RANKING
+# -----------------------
 def rank_cvs(job_description, files):
+    if not files:
+        return "Upload CVs."
+    job_description = clean_text(job_description)
+    # embed job once
+    job_emb = embed_chunks(job_description)
+    candidates = []
+    # ----------------
+    # Stage 1: Fast retrieval
+    # ----------------
+    for f in files:
+        name = os.path.basename(f)
+        text = clean_text(extract_text(f))
+        if not text:
             continue
+        emb = embed_chunks(text)
+        sim = cosine_similarity([job_emb], [emb])[0][0]
+        candidates.append({
+            "name": name,
+            "text": text,
+            "sim": sim
+        })
+    # shortlist top 20
+    candidates = sorted(candidates, key=lambda x: x["sim"], reverse=True)[:20]
+    # ----------------
+    # Stage 2: Cross-encoder rerank (accuracy boost)
+    # ----------------
+    pairs = [[job_description, c["text"][:3000]] for c in candidates]
+    ce_scores = cross_encoder.predict(pairs)
+    for c, ce in zip(candidates, ce_scores):
+        c["ce"] = ce
+    # ----------------
+    # Stage 3: Business logic scoring
+    # ----------------
+    for c in candidates:
+        s_score = skill_score(job_description, c["text"])
+        years = extract_years(c["text"])
+        final = (
+            0.5 * c["ce"] +        # semantic accuracy
+            0.3 * s_score +       # skills
+            0.2 * min(years/10,1) # experience
+        )
+        c["final"] = final
+    # ----------------
+    # sort final
+    # ----------------
+    candidates = sorted(candidates, key=lambda x: x["final"], reverse=True)
+    # ----------------
+    # Explainable output
+    # ----------------
+    output = ""
+    for i, c in enumerate(candidates[:10]):
+        output += (
+            f"### {i+1}. {c['name']}\n"
+            f"- Final Score: {c['final']:.3f}\n"
+            f"- Semantic: {c['ce']:.3f}\n"
+            f"- Skill Match: {skill_score(job_description,c['text']):.2f}\n"
+            f"- Years: {extract_years(c['text'])}\n\n"
         )
+    return output
+# -----------------------
+# UI
+# -----------------------
 demo = gr.Interface(
     fn=rank_cvs,
     inputs=[
+        gr.Textbox(label="Job Description", lines=6),
+        gr.File(file_count="multiple", type="filepath")
     ],
     outputs=gr.Markdown(),
+    title="Production CV Ranker"
 )
 if __name__ == "__main__":
+    demo.launch()