DevNumb committed on
Commit
362982f
·
verified ·
1 Parent(s): 265e52e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -45
app.py CHANGED
@@ -1,74 +1,188 @@
1
  import os
2
- import gradio as gr
3
- import fitz # PyMuPDF
4
  import docx
5
  import numpy as np
6
- from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
7
 
8
# 1. Load a pretrained Sentence Transformer model locally
# (downloaded once, then cached; used by get_embedding below).
model = SentenceTransformer("all-MiniLM-L6-v2")
10
 
11
 
12
- # ---- Text extraction ----
13
def extract_text(file_path, filename):
    """Return the plain text of a CV file.

    PDF files are read page by page with PyMuPDF; DOCX files paragraph by
    paragraph with python-docx. Any other extension yields "".
    """
    if filename.endswith(".pdf"):
        with fitz.open(file_path) as pdf:
            pages = [page.get_text("text") + "\n" for page in pdf]
        return "".join(pages)
    elif filename.endswith(".docx"):
        paragraphs = docx.Document(file_path).paragraphs
        return "\n".join(p.text for p in paragraphs)
    return ""
24
 
25
 
26
- # ---- Local embedding helper ----
27
def get_embedding(text):
    """Embed *text* with the locally loaded SentenceTransformer.

    Returns the embedding as a NumPy array.
    """
    vector = model.encode(text)
    return np.array(vector)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # ---- CV ranking ----
34
def rank_cvs(job_description, files):
    """Rank uploaded CVs by cosine similarity to the job description.

    Returns a Markdown string listing up to the ten best-matching files,
    or a warning message when either input is missing.
    """
    if not job_description or not files:
        return "⚠️ Please upload CVs and provide a job description."

    job_emb = get_embedding(job_description)
    ranked = []

    for path in files:
        # Get filename from file path
        name = os.path.basename(path)
        body = extract_text(path, name)
        if not body.strip():
            continue  # unreadable or empty file: leave it out of the ranking
        cv_emb = get_embedding(body[:4000])  # limit text length
        denom = np.linalg.norm(job_emb) * np.linalg.norm(cv_emb)
        ranked.append((name, np.dot(job_emb, cv_emb) / denom))

    top = sorted(ranked, key=lambda pair: pair[1], reverse=True)[:10]
    lines = [f"**{i+1}. {n}** — Similarity: `{s:.3f}`" for i, (n, s) in enumerate(top)]
    return "\n\n".join(lines)
59
 
 
 
 
60
 
61
- # ---- Gradio UI ----
62
# ---- Gradio UI: job-description textbox + multi-file CV upload,
# rendering rank_cvs's Markdown result below. ----
demo = gr.Interface(
    fn=rank_cvs,
    inputs=[
        gr.Textbox(label="💼 Job Description", lines=5),
        gr.File(label="📁 Upload CVs (PDF/DOCX)", file_count="multiple", type="filepath"),
    ],
    outputs=gr.Markdown(),
    title="📄 AI CV Ranker (Local Model)",
    description="Ranks uploaded CVs based on job relevance using local SentenceTransformer model.",
)
72
 
73
# Launch the Gradio app only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
 
1
  import os
2
+ import fitz
 
3
  import docx
4
  import numpy as np
5
+ import gradio as gr
6
+ import re
7
+ from sentence_transformers import SentenceTransformer, CrossEncoder
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
+ # -----------------------
11
+ # MODELS (better choices)
12
+ # -----------------------
13
 
14
+ bi_encoder = SentenceTransformer("BAAI/bge-base-en") # better embeddings
15
+ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
16
 
17
 
18
+ # -----------------------
19
+ # TEXT EXTRACTION
20
+ # -----------------------
21
+
22
def extract_text(file_path):
    """Extract plain text from a CV file.

    Supports PDF (via PyMuPDF) and DOCX (via python-docx); any other
    extension returns "".

    Fixes: the extension check is now case-insensitive, so files such as
    "CV.PDF" are no longer silently skipped, and PDF pages are joined with
    a newline so the last word of one page no longer fuses with the first
    word of the next.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".pdf":
        with fitz.open(file_path) as doc:
            # Newline separator prevents cross-page word fusion.
            return "\n".join(page.get_text() for page in doc)

    if suffix == ".docx":
        d = docx.Document(file_path)
        return "\n".join(p.text for p in d.paragraphs)

    return ""
35
 
36
 
37
+ # -----------------------
38
+ # CLEANING
39
+ # -----------------------
40
+
41
def clean_text(t):
    """Lower-case *t* and collapse every whitespace run into a single space."""
    return re.sub(r"\s+", " ", t.lower())
45
+
46
+
47
+ # -----------------------
48
+ # CHUNK EMBEDDINGS (IMPORTANT)
49
+ # -----------------------
50
+
51
def embed_chunks(text, size=400):
    """Mean-pooled bi-encoder embedding of *text*, chunked every *size* chars.

    Long documents exceed the encoder's input window, so the text is split
    into fixed-size character chunks, each chunk is embedded, and the chunk
    vectors are averaged into one document vector.

    Fix: an empty *text* used to produce an empty chunk list, so
    ``bi_encoder.encode([])`` / ``np.mean`` over an empty axis crashed or
    returned NaN; it now falls back to embedding the empty string.
    """
    chunks = [text[i:i + size] for i in range(0, len(text), size)] or [""]
    embs = bi_encoder.encode(chunks)
    return np.mean(embs, axis=0)
55
+
56
 
57
+ # -----------------------
58
+ # SKILL MATCHING
59
+ # -----------------------
60
+
61
# Flat keyword list for naive skill matching. Inputs are lower-cased by
# clean_text before matching, so every entry here must be lower-case.
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux"
]


def skill_score(job, cv):
    """Fraction of the job's required skills that also appear in the CV.

    A skill counts only as a whole word/phrase: the previous plain
    substring matching made "java" match "javascript" and "react" match
    "reaction". Returns 0 when the job mentions no known skill.
    """
    def has_skill(text, skill):
        # \b anchors avoid substring false positives (java vs javascript).
        return re.search(rf"\b{re.escape(skill)}\b", text) is not None

    job_skills = [s for s in SKILLS if has_skill(job, s)]
    if not job_skills:
        return 0
    matched = sum(has_skill(cv, s) for s in job_skills)
    return matched / len(job_skills)
73
+
74
+
75
+ # -----------------------
76
+ # EXPERIENCE EXTRACTION (simple rule)
77
+ # -----------------------
78
+
79
def extract_years(text):
    """Best-effort years-of-experience extraction from free text.

    Returns the largest number followed by "year(s)"/"yr(s)" — e.g.
    "7 years", "5+ yrs", "10years" — or 0 when nothing matches.
    Generalized from the original: also matches "yrs", allows the
    whitespace to be absent, is case-insensitive, and the trailing \b
    stops "year" from matching inside words like "yearly".

    NOTE(review): still crude — it cannot tell required experience apart
    from incidental phrases such as "a gap of 2 years".
    """
    matches = re.findall(r"(\d+)\+?\s*(?:years?|yrs?)\b", text, flags=re.IGNORECASE)
    return max((int(m) for m in matches), default=0)
82
+
83
+
84
+ # -----------------------
85
+ # MAIN RANKING
86
+ # -----------------------
87
 
 
88
def rank_cvs(job_description, files):
    """Three-stage CV ranking pipeline.

    1. Fast retrieval: embed the job and every CV with the bi-encoder and
       shortlist the top 20 by cosine similarity.
    2. Cross-encoder rerank of the shortlist (slower, more accurate).
    3. Business scoring: 0.5 * cross-encoder score + 0.3 * skill overlap
       + 0.2 * years of experience capped at 10.

    Returns a Markdown report of up to the ten best candidates, or a
    short message when the inputs are unusable.

    Fixes: an empty/blank job description previously crashed inside
    embed_chunks; a batch where every file was unreadable previously
    reached cross_encoder.predict([]); skill_score/extract_years were
    recomputed per candidate in the output loop and are now cached.
    """
    if not files:
        return "Upload CVs."
    if not job_description or not job_description.strip():
        return "Provide a job description."

    job_description = clean_text(job_description)

    # Embed the job once and reuse it for every CV.
    job_emb = embed_chunks(job_description)

    # ---- Stage 1: fast bi-encoder retrieval ----
    candidates = []
    for f in files:
        text = clean_text(extract_text(f))
        if not text:
            continue  # unreadable or empty file
        emb = embed_chunks(text)
        sim = cosine_similarity([job_emb], [emb])[0][0]
        candidates.append({"name": os.path.basename(f), "text": text, "sim": sim})

    if not candidates:
        # Every file failed extraction; nothing to rerank.
        return "No readable CVs found."

    # Shortlist top 20 for the expensive reranker.
    candidates = sorted(candidates, key=lambda c: c["sim"], reverse=True)[:20]

    # ---- Stage 2: cross-encoder rerank (accuracy boost) ----
    pairs = [[job_description, c["text"][:3000]] for c in candidates]
    for c, ce in zip(candidates, cross_encoder.predict(pairs)):
        c["ce"] = float(ce)

    # ---- Stage 3: business-logic scoring ----
    for c in candidates:
        c["skills"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])
        # NOTE(review): cross-encoder outputs are unbounded logits while the
        # other two terms are 0..1, so these weights are not strictly
        # comparable — consider a sigmoid on c["ce"] before mixing.
        c["final"] = (
            0.5 * c["ce"]                   # semantic accuracy
            + 0.3 * c["skills"]             # skills
            + 0.2 * min(c["years"] / 10, 1) # experience
        )

    candidates.sort(key=lambda c: c["final"], reverse=True)

    # ---- Explainable output (reuses the cached per-candidate scores) ----
    output = ""
    for i, c in enumerate(candidates[:10]):
        output += (
            f"### {i+1}. {c['name']}\n"
            f"- Final Score: {c['final']:.3f}\n"
            f"- Semantic: {c['ce']:.3f}\n"
            f"- Skill Match: {c['skills']:.2f}\n"
            f"- Years: {c['years']}\n\n"
        )
    return output
171
+
 
 
172
 
173
+ # -----------------------
174
+ # UI
175
+ # -----------------------
176
 
 
177
# -----------------------
# UI: job-description textbox + multi-file upload; rank_cvs's Markdown
# result is rendered below the form.
# -----------------------
demo = gr.Interface(
    fn=rank_cvs,
    inputs=[
        gr.Textbox(label="Job Description", lines=6),
        gr.File(file_count="multiple", type="filepath")
    ],
    outputs=gr.Markdown(),
    title="Production CV Ranker"
)
186
 
187
# Launch the Gradio app only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()