ketannnn commited on
Commit
bdaeeeb
·
1 Parent(s): 1ca9ba2

feat: add embedding singleton and batch ingest Celery worker

Browse files
backend/src/ml/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # ml package
backend/src/ml/embedder.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from ..config import get_settings
5
+
6
+ _model: SentenceTransformer | None = None
7
+ INSTRUCTION = "Represent this candidate profile for matching job descriptions: "
8
+
9
+
10
+ def _get_model() -> SentenceTransformer:
11
+ global _model
12
+ if _model is None:
13
+ settings = get_settings()
14
+ _model = SentenceTransformer(settings.embedding_model)
15
+ return _model
16
+
17
+
18
+ def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
19
+ model = _get_model()
20
+ if instruction:
21
+ texts = [INSTRUCTION + t for t in texts]
22
+ embeddings = model.encode(texts, normalize_embeddings=True, batch_size=64, show_progress_bar=False)
23
+ return embeddings.astype(np.float32)
24
+
25
+
26
+ def embed_query(text: str) -> np.ndarray:
27
+ model = _get_model()
28
+ query = "Represent this job description for retrieving matching candidates: " + text
29
+ emb = model.encode([query], normalize_embeddings=True, show_progress_bar=False)
30
+ return emb[0].astype(np.float32)
31
+
32
+
33
+ def compute_text_hash(text: str) -> str:
34
+ return hashlib.sha256(text.encode()).hexdigest()[:32]
backend/src/ml/feature_builder.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ from typing import Any
4
+
5
+
6
+ SENIORITY_MAP = {
7
+ "intern": 0, "trainee": 0, "junior": 1, "associate": 1,
8
+ "mid": 2, "senior": 3, "lead": 4, "staff": 4,
9
+ "principal": 5, "architect": 5, "manager": 4, "director": 6, "vp": 7, "cto": 8,
10
+ }
11
+
12
+ TIER1_EDU = {"iit", "iim", "nit", "bits", "iiit", "mit", "stanford", "cmu", "berkeley"}
13
+
14
+
15
+ def build_candidate_text(candidate: dict[str, Any]) -> str:
16
+ parts = []
17
+ if candidate.get("parsed_summary"):
18
+ parts.append(candidate["parsed_summary"])
19
+ if candidate.get("parsed_skills"):
20
+ parts.append(f"Skills: {candidate['parsed_skills']}")
21
+ langs = candidate.get("programming_languages") or []
22
+ if langs:
23
+ parts.append(f"Languages: {', '.join(langs)}")
24
+ frameworks = (candidate.get("backend_frameworks") or []) + (candidate.get("frontend_technologies") or [])
25
+ if frameworks:
26
+ parts.append(f"Frameworks: {', '.join(frameworks)}")
27
+ work_exp = candidate.get("parsed_work_experience") or []
28
+ for we in work_exp[:3]:
29
+ if isinstance(we, dict):
30
+ desc = we.get("description") or we.get("role") or ""
31
+ company = we.get("company") or ""
32
+ if desc or company:
33
+ parts.append(f"{company}: {desc}".strip(": "))
34
+ if candidate.get("most_recent_company_description"):
35
+ parts.append(candidate["most_recent_company_description"])
36
+ return " | ".join(filter(None, parts))
37
+
38
+
39
+ def _parse_duration_months(entry: dict) -> float:
40
+ duration = entry.get("duration") or entry.get("tenure") or ""
41
+ if not duration:
42
+ return 12.0
43
+ years = re.findall(r"(\d+\.?\d*)\s*(?:year|yr)", duration, re.IGNORECASE)
44
+ months = re.findall(r"(\d+\.?\d*)\s*(?:month|mo)", duration, re.IGNORECASE)
45
+ total = sum(float(y) * 12 for y in years) + sum(float(m) for m in months)
46
+ return total if total > 0 else 12.0
47
+
48
+
49
+ def _extract_seniority(title: str) -> int:
50
+ title_lower = title.lower()
51
+ for key, val in sorted(SENIORITY_MAP.items(), key=lambda x: -x[1]):
52
+ if key in title_lower:
53
+ return val
54
+ return 2
55
+
56
+
57
+ def compute_growth_velocity(work_experience: list[dict], is_funded: bool = False) -> float:
58
+ if not work_experience or len(work_experience) < 2:
59
+ base = 0.6 if is_funded else 0.5
60
+ return base
61
+
62
+ entries = sorted(work_experience, key=lambda x: x.get("start_date", "") or "")
63
+ seniority_levels = []
64
+ total_months = 0.0
65
+
66
+ for entry in entries:
67
+ if not isinstance(entry, dict):
68
+ continue
69
+ title = entry.get("title") or entry.get("role") or ""
70
+ seniority_levels.append(_extract_seniority(title))
71
+ total_months += _parse_duration_months(entry)
72
+
73
+ if len(seniority_levels) < 2:
74
+ return 0.5
75
+
76
+ seniority_gain = seniority_levels[-1] - seniority_levels[0]
77
+ years_elapsed = max(total_months / 12, 0.5)
78
+ velocity = seniority_gain / years_elapsed
79
+
80
+ normalized = min(max((velocity + 1) / 3, 0.0), 1.0)
81
+
82
+ if is_funded:
83
+ normalized = min(normalized + 0.1, 1.0)
84
+
85
+ return round(normalized, 4)
86
+
87
+
88
+ def skill_jaccard(jd_skills: list[str], candidate_skills: list[str]) -> float:
89
+ if not jd_skills:
90
+ return 0.5
91
+ jd_set = {s.lower().strip() for s in jd_skills if s}
92
+ cand_set = {s.lower().strip() for s in candidate_skills if s}
93
+ if not cand_set:
94
+ return 0.0
95
+ intersection = jd_set & cand_set
96
+ union = jd_set | cand_set
97
+ return len(intersection) / len(union) if union else 0.0
98
+
99
+
100
+ def yoe_match(min_yoe: float | None, max_yoe: float | None, candidate_yoe: float | None) -> float:
101
+ if candidate_yoe is None:
102
+ return 0.5
103
+ if min_yoe is None and max_yoe is None:
104
+ return 0.7
105
+ candidate_yoe = float(candidate_yoe)
106
+ if min_yoe is not None and candidate_yoe < min_yoe:
107
+ gap = min_yoe - candidate_yoe
108
+ return max(0.0, 1.0 - gap * 0.2)
109
+ if max_yoe is not None and candidate_yoe > max_yoe + 3:
110
+ return 0.7
111
+ return 1.0
112
+
113
+
114
+ def company_quality_signal(candidate: dict[str, Any]) -> float:
115
+ score = 0.5
116
+ if candidate.get("most_recent_company_is_product_company"):
117
+ score += 0.2
118
+ if candidate.get("most_recent_company_is_funded"):
119
+ score += 0.15
120
+ funding = candidate.get("most_recent_company_total_funding") or 0
121
+ if funding > 10_000_000:
122
+ score += 0.1
123
+ if funding > 100_000_000:
124
+ score += 0.05
125
+ return min(score, 1.0)
126
+
127
+
128
+ def education_match(candidate: dict[str, Any]) -> float:
129
+ degree = (candidate.get("degree") or "").lower()
130
+ status = (candidate.get("education_status") or "").lower()
131
+ score = 0.5
132
+ if "bachelor" in degree or "b.tech" in degree or "be " in degree:
133
+ score = 0.6
134
+ if "master" in degree or "m.tech" in degree or "mba" in degree:
135
+ score = 0.8
136
+ if "phd" in degree or "doctorate" in degree:
137
+ score = 0.9
138
+ for uni in TIER1_EDU:
139
+ if uni in degree or uni in status:
140
+ score = min(score + 0.15, 1.0)
141
+ break
142
+ return score
143
+
144
+
145
+ def compute_jd_quality(raw_text: str, parsed: dict[str, Any], candidate_count: int = 0) -> dict[str, Any]:
146
+ required_skills = parsed.get("required_skills") or []
147
+ skill_count = len(required_skills)
148
+
149
+ vagueness_score = 1.0
150
+ if skill_count >= 5:
151
+ vagueness_score = 0.2
152
+ elif skill_count >= 3:
153
+ vagueness_score = 0.5
154
+ elif skill_count >= 1:
155
+ vagueness_score = 0.75
156
+
157
+ word_count = len(raw_text.split())
158
+ if word_count < 50:
159
+ vagueness_score = min(vagueness_score + 0.3, 1.0)
160
+
161
+ contradictions = []
162
+ min_yoe = parsed.get("min_yoe")
163
+ engineer_type = (parsed.get("engineer_type") or "").lower()
164
+ if min_yoe and min_yoe >= 5 and "junior" in raw_text.lower():
165
+ contradictions.append("Requires 5+ YOE but mentions junior role")
166
+ if min_yoe and min_yoe <= 1 and "senior" in raw_text.lower():
167
+ contradictions.append("Entry-level YOE but expects senior candidate")
168
+
169
+ breadth_score = 0.0
170
+ if candidate_count > 0 and skill_count < 2:
171
+ breadth_score = 0.9
172
+
173
+ warnings = []
174
+ if vagueness_score > 0.6:
175
+ warnings.append("JD is too vague — add more specific skill requirements for better match quality")
176
+ if contradictions:
177
+ warnings.append(f"Contradictions detected: {'; '.join(contradictions)}")
178
+ if breadth_score > 0.7:
179
+ warnings.append("Requirements are too broad — almost all candidates will match")
180
+
181
+ overall = "good"
182
+ if vagueness_score > 0.6 or contradictions or breadth_score > 0.7:
183
+ overall = "poor"
184
+ elif vagueness_score > 0.35:
185
+ overall = "fair"
186
+
187
+ return {
188
+ "overall": overall,
189
+ "vagueness_score": round(vagueness_score, 3),
190
+ "breadth_score": round(breadth_score, 3),
191
+ "skill_count": skill_count,
192
+ "contradictions": contradictions,
193
+ "warnings": warnings,
194
+ }
195
+
196
+
197
+ def parse_jd_requirements(raw_text: str) -> dict[str, Any]:
198
+ skills = []
199
+ skill_patterns = [
200
+ r"\b(python|javascript|typescript|java|go|golang|rust|c\+\+|ruby|php|scala|kotlin|swift)\b",
201
+ r"\b(react|angular|vue|nextjs|fastapi|django|flask|express|springboot|rails)\b",
202
+ r"\b(postgresql|mysql|mongodb|redis|elasticsearch|kafka|rabbitmq|cassandra)\b",
203
+ r"\b(aws|gcp|azure|docker|kubernetes|terraform|ansible|ci\/cd|devops)\b",
204
+ r"\b(machine learning|deep learning|nlp|llm|rag|vector|embedding|pytorch|tensorflow)\b",
205
+ r"\b(sql|nosql|graphql|rest|grpc|microservices|api)\b",
206
+ ]
207
+ for pattern in skill_patterns:
208
+ found = re.findall(pattern, raw_text, re.IGNORECASE)
209
+ skills.extend([f.lower() for f in found])
210
+ skills = list(dict.fromkeys(skills))
211
+
212
+ yoe_match_obj = re.search(r"(\d+)\+?\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)", raw_text, re.IGNORECASE)
213
+ min_yoe = float(yoe_match_obj.group(1)) if yoe_match_obj else None
214
+
215
+ role_type = None
216
+ if re.search(r"\bfull.?time\b", raw_text, re.IGNORECASE):
217
+ role_type = "full-time"
218
+ elif re.search(r"\bcontract\b", raw_text, re.IGNORECASE):
219
+ role_type = "contract"
220
+ elif re.search(r"\bpart.?time\b", raw_text, re.IGNORECASE):
221
+ role_type = "part-time"
222
+
223
+ engineer_type = None
224
+ if re.search(r"\bbackend\b", raw_text, re.IGNORECASE):
225
+ engineer_type = "backend"
226
+ elif re.search(r"\bfrontend\b", raw_text, re.IGNORECASE):
227
+ engineer_type = "frontend"
228
+ elif re.search(r"\bfullstack\b|full.?stack\b", raw_text, re.IGNORECASE):
229
+ engineer_type = "fullstack"
230
+ elif re.search(r"\bai\s+engineer|ml\s+engineer|machine\s+learning", raw_text, re.IGNORECASE):
231
+ engineer_type = "ai"
232
+ elif re.search(r"\bdata\s+engineer\b", raw_text, re.IGNORECASE):
233
+ engineer_type = "data"
234
+
235
+ remote_allowed = bool(re.search(r"\bremote\b", raw_text, re.IGNORECASE))
236
+
237
+ location_match = re.search(
238
+ r"\b(bangalore|mumbai|delhi|hyderabad|chennai|pune|kolkata|remote|india|us|usa|uk|london|new york|san francisco)\b",
239
+ raw_text, re.IGNORECASE
240
+ )
241
+ location = location_match.group(0).title() if location_match else None
242
+
243
+ return {
244
+ "required_skills": skills,
245
+ "min_yoe": min_yoe,
246
+ "max_yoe": None,
247
+ "role_type": role_type,
248
+ "engineer_type": engineer_type,
249
+ "remote_allowed": remote_allowed,
250
+ "location": location,
251
+ }
backend/src/ml/reranker.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from FlagEmbedding import FlagReranker
2
+ from ..config import get_settings
3
+
4
+ _reranker: FlagReranker | None = None
5
+
6
+
7
+ def _get_reranker() -> FlagReranker:
8
+ global _reranker
9
+ if _reranker is None:
10
+ settings = get_settings()
11
+ _reranker = FlagReranker(settings.reranker_model, use_fp16=False)
12
+ return _reranker
13
+
14
+
15
+ def rerank(query: str, passages: list[str]) -> list[float]:
16
+ if not passages:
17
+ return []
18
+ reranker = _get_reranker()
19
+ pairs = [[query, p] for p in passages]
20
+ scores = reranker.compute_score(pairs, normalize=True)
21
+ if isinstance(scores, float):
22
+ scores = [scores]
23
+ return [float(s) for s in scores]
backend/src/workers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # workers package
backend/src/workers/celery_app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from celery import Celery
3
+ from ..config import get_settings
4
+
5
+ settings = get_settings()
6
+
7
+ celery_app = Celery(
8
+ "talentpulse",
9
+ broker=settings.redis_url,
10
+ backend=settings.redis_url,
11
+ include=["src.workers.ingest"],
12
+ )
13
+
14
+ celery_app.conf.update(
15
+ task_serializer="json",
16
+ accept_content=["json"],
17
+ result_serializer="json",
18
+ timezone="UTC",
19
+ enable_utc=True,
20
+ task_track_started=True,
21
+ result_expires=3600,
22
+ worker_prefetch_multiplier=1,
23
+ task_acks_late=True,
24
+ )