Spaces:

ketannnn
/

coderound

Sleeping

App Files Files Community

ketannnn committed on Apr 12

Commit

4427789

1 Parent(s): 3330b0f

feat: implement Stage 1 retriever - ANN search + structured weighted scorer

Browse files

Files changed (2) hide show

backend/src/matching/__init__.py +1 -0
backend/src/matching/stage1.py +141 -0

backend/src/matching/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # matching package

backend/src/matching/stage1.py ADDED Viewed

	@@ -0,0 +1,141 @@

+from typing import Any
+from qdrant_client import QdrantClient
+from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, or_
+from ..config import get_settings
+from ..models.candidate import Candidate
+from ..ml.embedder import embed_query
+from ..ml.feature_builder import (
+    skill_jaccard,
+    yoe_match,
+    company_quality_signal,
+    education_match,
+)
# Default weight of each scoring component in the Stage 1 weighted sum.
# The six weights add up to 1.0; callers of stage1_retrieve may override
# individual entries through its ``weights`` argument.
DEFAULT_WEIGHTS = dict(
    semantic=0.20,
    skill=0.35,
    yoe=0.15,
    company=0.10,
    growth=0.10,
    education=0.10,
)
def _build_qdrant_filter(jd: dict) -> Filter | None:
    """Turn job-description fields into a Qdrant payload filter.

    Two optional constraints are derived from ``jd``:

    * ``role_type`` (when truthy) must match the payload's ``role_type``.
    * ``min_yoe`` (when not ``None``) puts a lower bound on the payload's
      ``years_of_experience``. The bound is relaxed by two years and
      clamped at zero.

    Returns:
        A ``Filter`` whose ``must`` list holds the applicable conditions,
        or ``None`` when neither constraint applies.
    """
    role_type = jd.get("role_type")
    min_yoe = jd.get("min_yoe")

    must = []
    if role_type:
        must.append(
            FieldCondition(key="role_type", match=MatchValue(value=role_type))
        )
    if min_yoe is not None:
        # Allow candidates up to two years short of the stated minimum.
        lower_bound = max(0, min_yoe - 2)
        must.append(
            FieldCondition(key="years_of_experience", range=Range(gte=lower_bound))
        )

    return Filter(must=must) if must else None
def _candidate_skills(cand: Candidate) -> list:
    """Merge a candidate's structured skill lists with the comma-separated ``parsed_skills``."""
    skills = [
        *(cand.programming_languages or []),
        *(cand.backend_frameworks or []),
        *(cand.frontend_technologies or []),
    ]
    if cand.parsed_skills:
        skills.extend(
            part.strip() for part in cand.parsed_skills.split(",") if part.strip()
        )
    return skills


def _component_scores(jd: dict, cand: Candidate, similarity: float) -> dict[str, float]:
    """Compute the per-component scores that feed the Stage 1 weighted sum."""
    growth = cand.growth_velocity
    return {
        "semantic": similarity,
        "skill": skill_jaccard(jd.get("required_skills") or [], _candidate_skills(cand)),
        "yoe": yoe_match(jd.get("min_yoe"), jd.get("max_yoe"), cand.years_of_experience),
        "company": company_quality_signal(
            {
                "most_recent_company_is_funded": cand.most_recent_company_is_funded,
                "most_recent_company_is_product_company": cand.most_recent_company_is_product_company,
                "most_recent_company_total_funding": cand.most_recent_company_total_funding,
            }
        ),
        # Only a *missing* growth velocity falls back to the neutral 0.5.
        # A genuine 0.0 must stay 0.0 (``or 0.5`` would overwrite it).
        "growth": 0.5 if growth is None else float(growth),
        "education": education_match(
            {
                "degree": cand.degree,
                "education_status": cand.education_status,
            }
        ),
    }


async def stage1_retrieve(
    jd: dict,
    db: AsyncSession,
    qdrant: QdrantClient,
    top_k: int = 200,
    weights: dict | None = None,
    final_k: int = 50,
) -> list[dict[str, Any]]:
    """Retrieve and score candidates for a job description (Stage 1).

    Steps:
      1. Embed ``"{title} {raw_text}"`` from ``jd`` and run a vector search
         in Qdrant, limited to ``top_k`` hits and restricted by the filter
         from ``_build_qdrant_filter``.
      2. Load the matching ``Candidate`` rows by ``qdrant_id``.
      3. Compute the component scores (semantic, skill, yoe, company,
         growth, education) and combine them as a weighted sum.
      4. Sort by the combined score, highest first, and return the best
         ``final_k`` entries.

    Args:
        jd: Job-description dict. Keys read here: ``title``, ``raw_text``,
            ``role_type``, ``min_yoe``, ``max_yoe``, ``required_skills``.
        db: Async SQLAlchemy session used to load ``Candidate`` rows.
        qdrant: Qdrant client used for the vector search.
        top_k: Maximum number of vector-search hits to fetch.
        weights: Optional per-component overrides merged over
            ``DEFAULT_WEIGHTS``.
        final_k: Maximum number of scored candidates to return
            (defaults to 50; negative values are treated as 0).

    Returns:
        A list of dicts, best first, each holding candidate fields plus
        ``stage1_score`` and ``component_scores`` (both rounded to 4 places).
        Empty when the vector search returns nothing.
    """
    settings = get_settings()
    w = {**DEFAULT_WEIGHTS, **(weights or {})}

    jd_text = f"{jd.get('title', '')} {jd.get('raw_text', '')}"
    # NOTE(review): embed_query and qdrant.search are called synchronously
    # inside this coroutine; if either blocks for long, the event loop
    # stalls. Consider asyncio.to_thread or an async Qdrant client.
    query_vector = embed_query(jd_text)
    search_results = qdrant.search(
        collection_name=settings.collection_name,
        query_vector=query_vector.tolist(),
        query_filter=_build_qdrant_filter(jd),
        limit=top_k,
        with_payload=True,
    )
    if not search_results:
        return []

    result = await db.execute(
        select(Candidate).where(
            Candidate.qdrant_id.in_([hit.id for hit in search_results])
        )
    )
    candidates = {c.qdrant_id: c for c in result.scalars().all()}

    scored = []
    # Walk the hits in search order so ties keep that order after the
    # stable sort below.
    for hit in search_results:
        cand = candidates.get(hit.id)
        if cand is None:
            # The vector exists in Qdrant but has no matching database row.
            continue

        components = _component_scores(jd, cand, float(hit.score))
        total = sum(w.get(k, 0) * v for k, v in components.items())
        scored.append(
            {
                "candidate_id": str(cand.id),
                "qdrant_id": hit.id,
                "name": cand.name,
                "email": cand.email,
                "role_type": cand.role_type,
                "engineer_type": cand.engineer_type,
                "years_of_experience": cand.years_of_experience,
                "most_recent_company": cand.most_recent_company,
                "parsed_summary": cand.parsed_summary,
                "parsed_skills": cand.parsed_skills,
                "parsed_work_experience": cand.parsed_work_experience or [],
                "programming_languages": cand.programming_languages or [],
                "backend_frameworks": cand.backend_frameworks or [],
                "frontend_technologies": cand.frontend_technologies or [],
                "growth_velocity": cand.growth_velocity,
                "stage1_score": round(total, 4),
                "component_scores": {k: round(v, 4) for k, v in components.items()},
            }
        )

    scored.sort(key=lambda entry: entry["stage1_score"], reverse=True)
    # Clamp so a negative final_k cannot turn into a from-the-end slice.
    return scored[: max(final_k, 0)]