ketannnn committed on
Commit
4427789
·
1 Parent(s): 3330b0f

feat: implement Stage 1 retriever - ANN search + structured weighted scorer

Browse files
backend/src/matching/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # matching package
backend/src/matching/stage1.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from qdrant_client import QdrantClient
3
+ from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
4
+ from sqlalchemy.ext.asyncio import AsyncSession
5
+ from sqlalchemy import select, or_
6
+
7
+ from ..config import get_settings
8
+ from ..models.candidate import Candidate
9
+ from ..ml.embedder import embed_query
10
+ from ..ml.feature_builder import (
11
+ skill_jaccard,
12
+ yoe_match,
13
+ company_quality_signal,
14
+ education_match,
15
+ )
16
+
17
+
18
# Default weight of each Stage 1 score component. The keys match the
# component names computed in stage1_retrieve, and the values sum to 1.0.
# Callers may override individual entries through the `weights` argument.
DEFAULT_WEIGHTS = dict(
    semantic=0.20,
    skill=0.35,
    yoe=0.15,
    company=0.10,
    growth=0.10,
    education=0.10,
)
26
+
27
+
28
def _build_qdrant_filter(jd: dict) -> Filter | None:
    """Build the Qdrant payload filter for a job description.

    Two optional constraints are read from ``jd``:

    * ``role_type`` -- when truthy, candidates must match it exactly.
    * ``min_yoe`` -- when not ``None``, candidates must have at least
      ``min_yoe - 2`` years of experience (never below zero).

    Returns:
        A ``Filter`` requiring every applicable condition, or ``None`` when
        the job description yields no conditions.
    """
    role_type = jd.get("role_type")
    min_yoe = jd.get("min_yoe")

    must = []
    if role_type:
        must.append(
            FieldCondition(key="role_type", match=MatchValue(value=role_type))
        )
    if min_yoe is not None:
        # Relax the lower bound by two years, clamped at zero.
        yoe_floor = max(0, min_yoe - 2)
        must.append(
            FieldCondition(key="years_of_experience", range=Range(gte=yoe_floor))
        )

    return Filter(must=must) if must else None
41
+
42
+
43
def _candidate_skills(cand: Candidate) -> list[str]:
    """Collect every skill string recorded for ``cand`` into one new list."""
    # The `+` concatenation builds a fresh list, so the ORM attributes are
    # never mutated by the extend() below.
    skills = (
        (cand.programming_languages or [])
        + (cand.backend_frameworks or [])
        + (cand.frontend_technologies or [])
    )
    if cand.parsed_skills:
        # parsed_skills is handled as a comma-separated string; blanks dropped.
        skills.extend(s.strip() for s in cand.parsed_skills.split(",") if s.strip())
    return skills


async def stage1_retrieve(
    jd: dict,
    db: AsyncSession,
    qdrant: QdrantClient,
    top_k: int = 200,
    weights: dict | None = None,
    final_k: int = 50,
) -> list[dict[str, Any]]:
    """Retrieve and score Stage 1 candidates for a job description.

    The job description's title and raw text are embedded and used for a
    vector search in Qdrant (optionally filtered via ``_build_qdrant_filter``).
    The matching ``Candidate`` rows are then loaded from the database and each
    one receives a weighted sum of six component scores: semantic, skill,
    yoe, company, growth and education.

    Args:
        jd: Job description mapping. Keys read here: ``title``, ``raw_text``,
            ``required_skills``, ``min_yoe``, ``max_yoe`` (plus ``role_type``
            through the filter builder).
        db: Async SQLAlchemy session used to load ``Candidate`` rows.
        qdrant: Qdrant client used for the vector search.
        top_k: Maximum number of hits requested from Qdrant.
        weights: Optional overrides merged on top of ``DEFAULT_WEIGHTS``.
        final_k: Maximum number of scored candidates returned (default 50,
            which was previously hard-coded).

    Returns:
        Candidate dicts sorted by ``stage1_score`` descending, at most
        ``final_k`` long. Empty when Qdrant returns no hits.
    """
    settings = get_settings()
    w = {**DEFAULT_WEIGHTS, **(weights or {})}

    # `or ''` keeps a present-but-null field from being embedded as "None".
    jd_text = f"{jd.get('title') or ''} {jd.get('raw_text') or ''}"
    query_vector = embed_query(jd_text)

    # NOTE(review): this call is not awaited, so it appears to run
    # synchronously inside the coroutine -- confirm whether blocking the
    # event loop here is acceptable.
    search_results = qdrant.search(
        collection_name=settings.collection_name,
        query_vector=query_vector.tolist(),
        query_filter=_build_qdrant_filter(jd),
        limit=top_k,
        with_payload=True,
    )
    if not search_results:
        return []

    qdrant_ids = [hit.id for hit in search_results]
    score_by_qdrant_id = {hit.id: float(hit.score) for hit in search_results}

    result = await db.execute(
        select(Candidate).where(Candidate.qdrant_id.in_(qdrant_ids))
    )
    candidates = {c.qdrant_id: c for c in result.scalars().all()}

    jd_skills = jd.get("required_skills") or []
    min_yoe = jd.get("min_yoe")
    max_yoe = jd.get("max_yoe")

    scored = []
    for qid in qdrant_ids:
        cand = candidates.get(qid)
        if cand is None:
            # Vector hit without a matching database row: skip it.
            continue

        # Only a missing growth velocity falls back to the neutral 0.5; a
        # real 0.0 must stay 0.0 (`x or 0.5` would wrongly promote it).
        growth = 0.5 if cand.growth_velocity is None else float(cand.growth_velocity)

        components = {
            "semantic": score_by_qdrant_id[qid],
            "skill": skill_jaccard(jd_skills, _candidate_skills(cand)),
            "yoe": yoe_match(min_yoe, max_yoe, cand.years_of_experience),
            "company": company_quality_signal(
                {
                    "most_recent_company_is_funded": cand.most_recent_company_is_funded,
                    "most_recent_company_is_product_company": cand.most_recent_company_is_product_company,
                    "most_recent_company_total_funding": cand.most_recent_company_total_funding,
                }
            ),
            "growth": growth,
            "education": education_match(
                {
                    "degree": cand.degree,
                    "education_status": cand.education_status,
                }
            ),
        }

        # Components without a configured weight contribute nothing.
        total = sum(w.get(name, 0) * value for name, value in components.items())

        scored.append(
            {
                "candidate_id": str(cand.id),
                "qdrant_id": qid,
                "name": cand.name,
                "email": cand.email,
                "role_type": cand.role_type,
                "engineer_type": cand.engineer_type,
                "years_of_experience": cand.years_of_experience,
                "most_recent_company": cand.most_recent_company,
                "parsed_summary": cand.parsed_summary,
                "parsed_skills": cand.parsed_skills,
                "parsed_work_experience": cand.parsed_work_experience or [],
                "programming_languages": cand.programming_languages or [],
                "backend_frameworks": cand.backend_frameworks or [],
                "frontend_technologies": cand.frontend_technologies or [],
                "growth_velocity": cand.growth_velocity,
                "stage1_score": round(total, 4),
                "component_scores": {
                    name: round(value, 4) for name, value in components.items()
                },
            }
        )

    scored.sort(key=lambda item: item["stage1_score"], reverse=True)
    return scored[:final_k]