Siggmoid Cursor committed on
Commit
cea745e
·
1 Parent(s): 51e25cb

Improve semantic score with chunk matching, MPNet, and calibration

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. utilities/keyword_match.py +109 -17
Dockerfile CHANGED
@@ -23,7 +23,7 @@ COPY --chown=user requirements.txt requirements.txt
23
  RUN pip install --no-cache-dir -r requirements.txt
24
 
25
  # Pre-download embedding model at build time (faster Space cold start)
26
- RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')\""
27
 
28
  COPY --chown=user . /app
29
 
 
23
  RUN pip install --no-cache-dir -r requirements.txt
24
 
25
  # Pre-download embedding model at build time (faster Space cold start)
26
+ RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')\""
27
 
28
  COPY --chown=user . /app
29
 
utilities/keyword_match.py CHANGED
@@ -1,4 +1,7 @@
 
1
  import re
 
 
2
  from sklearn.metrics.pairwise import cosine_similarity
3
  from sentence_transformers import SentenceTransformer
4
  from utilities.skills import (
@@ -8,7 +11,13 @@ from utilities.skills import (
8
  clean_text,
9
  )
10
 
11
- model = SentenceTransformer("all-MiniLM-L6-v2")
 
 
 
 
 
 
12
 
13
 
14
  # ---------------------------------------------------------------------------
@@ -35,20 +44,75 @@ STOP_WORDS: set = {
35
  # Text utilities
36
  # ---------------------------------------------------------------------------
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def extract_skill_sentences(text: str) -> str:
39
  """
40
- Keep only sentences / bullet points that contain at least one
41
- known tech skill. Falls back to the full text if nothing matches
42
- (prevents a zero-length embedding).
43
  """
44
- segments = re.split(r'[.\n;]', text)
45
- cleaned_text = clean_text(text)
46
  relevant = []
47
  for seg in segments:
48
  seg_clean = clean_text(seg)
 
 
49
  if any(skill in seg_clean for skill in SKILLS_SORTED_BY_LENGTH):
50
  relevant.append(seg_clean)
51
- return " ".join(relevant) if relevant else cleaned_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
  def remove_stop_words(text: str) -> set:
@@ -92,19 +156,47 @@ def keyword_match_score(resume_text: str, jd_text: str) -> float:
92
 
93
  def semantic_match_score(resume_text: str, jd_text: str) -> float:
94
  """
95
- Skill-focused semantic similarity.
96
 
97
- Strategy:
98
- - Filter both texts down to skill-relevant sentences before encoding.
99
- - This focuses the embedding on technical content and reduces noise
100
- from generic filler language ("we are a fast-paced team...").
 
 
101
  """
102
- resume_focused = extract_skill_sentences(resume_text)
103
- jd_focused = extract_skill_sentences(jd_text)
 
 
 
104
 
105
- embeddings = model.encode([resume_focused, jd_focused])
106
- score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
107
- return round(float(score) * 100, 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  def experience_level_penalty(resume_text: str, jd_text: str) -> float:
 
1
+ import os
2
  import re
3
+
4
+ import numpy as np
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from sentence_transformers import SentenceTransformer
7
  from utilities.skills import (
 
11
  clean_text,
12
  )
13
 
14
+ # MPNet is stronger than MiniLM for long-form resume/JD similarity.
15
+ SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "all-mpnet-base-v2")
16
+ MAX_DOC_CHARS = 8000
17
+ MAX_CHUNKS = 24
18
+ MIN_CHUNK_CHARS = 35
19
+
20
+ model = SentenceTransformer(SEMANTIC_MODEL_ID)
21
 
22
 
23
  # ---------------------------------------------------------------------------
 
44
  # Text utilities
45
  # ---------------------------------------------------------------------------
46
 
47
+ def truncate_text(text: str, max_chars: int = MAX_DOC_CHARS) -> str:
48
+ if len(text) <= max_chars:
49
+ return text
50
+ return text[:max_chars].rsplit(" ", 1)[0]
51
+
52
+
53
+ def split_into_chunks(text: str, max_chunks: int = MAX_CHUNKS) -> list[str]:
54
+ """Split resume/JD into comparable segments (bullets, lines, sentences)."""
55
+ if not text:
56
+ return []
57
+
58
+ parts = re.split(r"[\n\r]+|(?<=[.!?])\s+", text)
59
+ chunks = [p.strip() for p in parts if len(p.strip()) >= MIN_CHUNK_CHARS]
60
+
61
+ if not chunks and text.strip():
62
+ words = text.split()
63
+ window = 55
64
+ for i in range(0, len(words), window):
65
+ piece = " ".join(words[i : i + window])
66
+ if len(piece) >= MIN_CHUNK_CHARS:
67
+ chunks.append(piece)
68
+
69
+ return chunks[:max_chunks]
70
+
71
+
72
  def extract_skill_sentences(text: str) -> str:
73
  """
74
+ Skill-heavy lines only — used as a secondary signal, not the main embedding.
 
 
75
  """
76
+ segments = re.split(r"[\n\r.;]+", text)
 
77
  relevant = []
78
  for seg in segments:
79
  seg_clean = clean_text(seg)
80
+ if len(seg_clean) < MIN_CHUNK_CHARS:
81
+ continue
82
  if any(skill in seg_clean for skill in SKILLS_SORTED_BY_LENGTH):
83
  relevant.append(seg_clean)
84
+ return " ".join(relevant) if relevant else clean_text(text)
85
+
86
+
87
+ def calibrate_semantic_score(cosine: float) -> float:
88
+ """
89
+ Map raw cosine similarity to a 0–100 ATS-style scale.
90
+
91
+ MPNet/MiniLM cosine for related resume/JD pairs usually sits in ~0.35–0.82,
92
+ not 0.9+, so raw cosine understates good matches without calibration.
93
+ """
94
+ cosine = float(np.clip(cosine, 0.0, 1.0))
95
+ low, high = 0.32, 0.78
96
+ scaled = (cosine - low) / (high - low) * 100.0
97
+ return round(float(np.clip(scaled, 0.0, 100.0)), 2)
98
+
99
+
100
+ def _pairwise_cosine(a: np.ndarray, b: np.ndarray) -> float:
101
+ return float(cosine_similarity([a], [b])[0][0])
102
+
103
+
104
+ def _chunk_bidirectional_score(resume_chunks: list[str], jd_chunks: list[str]) -> float:
105
+ """How well JD requirements are covered by resume (and vice versa)."""
106
+ if not resume_chunks or not jd_chunks:
107
+ return 0.0
108
+
109
+ resume_emb = model.encode(resume_chunks, convert_to_numpy=True)
110
+ jd_emb = model.encode(jd_chunks, convert_to_numpy=True)
111
+ sim_matrix = cosine_similarity(resume_emb, jd_emb)
112
+
113
+ jd_coverage = float(sim_matrix.max(axis=0).mean())
114
+ resume_coverage = float(sim_matrix.max(axis=1).mean())
115
+ return (jd_coverage + resume_coverage) / 2.0
116
 
117
 
118
  def remove_stop_words(text: str) -> set:
 
156
 
157
  def semantic_match_score(resume_text: str, jd_text: str) -> float:
158
  """
159
+ Semantic similarity tuned for resume ↔ JD alignment.
160
 
161
+ Combines:
162
+ 1. Full-document embedding (overall theme)
163
+ 2. Chunk-level bi-directional match (handles length mismatch)
164
+ 3. Skill-heavy lines (secondary boost)
165
+
166
+ Raw cosine is calibrated to a more intuitive 0–100 scale.
167
  """
168
+ resume_clean = clean_text(resume_text)
169
+ jd_clean = clean_text(jd_text)
170
+
171
+ if not resume_clean or not jd_clean:
172
+ return 0.0
173
 
174
+ resume_doc = truncate_text(resume_clean)
175
+ jd_doc = truncate_text(jd_clean)
176
+
177
+ doc_emb = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
178
+ full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
179
+
180
+ resume_chunks = split_into_chunks(resume_clean)
181
+ jd_chunks = split_into_chunks(jd_clean)
182
+ if len(resume_chunks) > 1 and len(jd_chunks) > 1:
183
+ chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
184
+ else:
185
+ chunk_sim = full_sim
186
+
187
+ resume_skills = extract_skill_sentences(resume_clean)
188
+ jd_skills = extract_skill_sentences(jd_clean)
189
+ if resume_skills and jd_skills:
190
+ skill_emb = model.encode(
191
+ [truncate_text(resume_skills, 4000), truncate_text(jd_skills, 4000)],
192
+ convert_to_numpy=True,
193
+ )
194
+ skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
195
+ else:
196
+ skill_sim = full_sim
197
+
198
+ raw_cosine = 0.30 * full_sim + 0.55 * chunk_sim + 0.15 * skill_sim
199
+ return calibrate_semantic_score(raw_cosine)
200
 
201
 
202
  def experience_level_penalty(resume_text: str, jd_text: str) -> float: