Spaces:

ketannnn
/

coderound

Sleeping

App Files Files Community

ketannnn committed on Apr 12

Commit

bdaeeeb

1 Parent(s): 1ca9ba2

feat: add embedding singleton and batch ingest Celery worker

Browse files

Files changed (6) hide show

backend/src/ml/__init__.py +1 -0
backend/src/ml/embedder.py +34 -0
backend/src/ml/feature_builder.py +251 -0
backend/src/ml/reranker.py +23 -0
backend/src/workers/__init__.py +1 -0
backend/src/workers/celery_app.py +24 -0

backend/src/ml/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # ml package

backend/src/ml/embedder.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import hashlib
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from ..config import get_settings
# Module-level cache for the embedding model; stays None until first use.
_model: SentenceTransformer | None = None

# Prefix prepended to candidate texts before they are encoded.
INSTRUCTION = "Represent this candidate profile for matching job descriptions: "


def _get_model() -> SentenceTransformer:
    """Return the shared SentenceTransformer, constructing it on first call.

    The model name is read from ``get_settings().embedding_model``. Later
    calls return the instance cached in the module-level ``_model``.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer(get_settings().embedding_model)
    return _model
def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
    """Encode a batch of texts with the shared embedding model.

    Args:
        texts: Texts to embed.
        instruction: When true, ``INSTRUCTION`` is prepended to every text
            before encoding.

    Returns:
        The encoder output cast to ``float32``. The encoder is asked for
        normalized embeddings and runs with a batch size of 64.
    """
    encoder = _get_model()
    inputs = [INSTRUCTION + item for item in texts] if instruction else texts
    vectors = encoder.encode(
        inputs,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False,
    )
    return vectors.astype(np.float32)
def embed_query(text: str) -> np.ndarray:
    """Encode a single job-description query with the shared embedding model.

    The query-side instruction prefix is prepended to ``text`` before
    encoding, and the encoder is asked for normalized embeddings.

    Returns:
        The first (only) row of the encoder output, cast to ``float32``.
    """
    encoder = _get_model()
    prefixed = "Represent this job description for retrieving matching candidates: " + text
    encoded = encoder.encode([prefixed], normalize_embeddings=True, show_progress_bar=False)
    first_row = encoded[0]
    return first_row.astype(np.float32)
def compute_text_hash(text: str) -> str:
    """Return the first 32 hex characters of the SHA-256 digest of ``text``.

    The text is encoded with ``str.encode()`` (default encoding) before
    hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()[:32]

backend/src/ml/feature_builder.py ADDED Viewed

	@@ -0,0 +1,251 @@

+import re
+import math
+from typing import Any
# Title keyword -> seniority rank (larger means more senior). Looked up by
# _extract_seniority, which falls back to 2 when no keyword is found.
SENIORITY_MAP = {
    "intern": 0,
    "trainee": 0,
    "junior": 1,
    "associate": 1,
    "mid": 2,
    "senior": 3,
    "lead": 4,
    "staff": 4,
    "principal": 5,
    "architect": 5,
    "manager": 4,
    "director": 6,
    "vp": 7,
    "cto": 8,
}

# Institution keywords that earn the tier-1 bonus in education_match.
TIER1_EDU = {
    "iit",
    "iim",
    "nit",
    "bits",
    "iiit",
    "mit",
    "stanford",
    "cmu",
    "berkeley",
}
def build_candidate_text(candidate: dict[str, Any]) -> str:
    """Flatten a candidate record into one ``" | "``-separated text for embedding.

    Sections are emitted in this order, each only when it has content:
    summary, skills, programming languages, frameworks (backend followed by
    frontend), up to three work-experience entries, and the most recent
    company description.

    List-like fields may be given either as a list/tuple of values or as a
    single pre-joined string; both render as readable comma-separated text
    (a list is never rendered as its Python ``repr``). ``None`` and empty
    items are skipped.

    Args:
        candidate: Candidate record; every key is optional.

    Returns:
        The combined text, or ``""`` when nothing is available.
    """

    def _as_items(value: Any) -> list[str]:
        """Normalise a str / list / tuple / None field to a list of non-empty strings."""
        if not value:
            return []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item]
        # A bare string (or other scalar) is treated as one item, not iterated
        # character by character.
        return [str(value)]

    parts: list[str] = []

    summary = candidate.get("parsed_summary")
    if summary:
        parts.append(str(summary))

    skills = _as_items(candidate.get("parsed_skills"))
    if skills:
        parts.append(f"Skills: {', '.join(skills)}")

    languages = _as_items(candidate.get("programming_languages"))
    if languages:
        parts.append(f"Languages: {', '.join(languages)}")

    frameworks = _as_items(candidate.get("backend_frameworks")) + _as_items(
        candidate.get("frontend_technologies")
    )
    if frameworks:
        parts.append(f"Frameworks: {', '.join(frameworks)}")

    work_exp = candidate.get("parsed_work_experience") or []
    if not isinstance(work_exp, (list, tuple)):
        work_exp = []
    # Only the first three entries are used; non-dict entries are ignored.
    for entry in work_exp[:3]:
        if not isinstance(entry, dict):
            continue
        desc = str(entry.get("description") or entry.get("role") or "").strip()
        company = str(entry.get("company") or "").strip()
        if company and desc:
            parts.append(f"{company}: {desc}")
        elif company or desc:
            parts.append(company or desc)

    company_description = candidate.get("most_recent_company_description")
    if company_description:
        parts.append(str(company_description))

    return " | ".join(part for part in parts if part)
+def _parse_duration_months(entry: dict) -> float:
+    duration = entry.get("duration") or entry.get("tenure") or ""
+    if not duration:
+        return 12.0
+    years = re.findall(r"(\d+\.?\d*)\s*(?:year|yr)", duration, re.IGNORECASE)
+    months = re.findall(r"(\d+\.?\d*)\s*(?:month|mo)", duration, re.IGNORECASE)
+    total = sum(float(y) * 12 for y in years) + sum(float(m) for m in months)
+    return total if total > 0 else 12.0
def _extract_seniority(title: str) -> int:
    """Map a job title to a seniority rank using ``SENIORITY_MAP``.

    Keywords are matched against whole words of the lower-cased title, not
    arbitrary substrings. Substring matching mis-ranks common titles: "cto"
    occurs inside "director" and "contractor", and "intern" inside
    "internal".

    Args:
        title: Job title; ``None`` or empty is treated as no information.

    Returns:
        The highest rank among the keywords present in the title, or ``2``
        (the "mid" rank) when none match.
    """
    words = set(re.findall(r"[a-z0-9]+", (title or "").lower()))
    ranks = [rank for keyword, rank in SENIORITY_MAP.items() if keyword in words]
    # The highest-ranked keyword wins, e.g. "Senior Director" -> director.
    return max(ranks, default=2)
def compute_growth_velocity(work_experience: list[dict], is_funded: bool = False) -> float:
    """Score how quickly a candidate's seniority has grown, in ``[0.0, 1.0]``.

    Entries are ordered by ``start_date`` (missing dates sort first; ties keep
    their input order). The seniority gain between the first and last entry
    is divided by the total parsed tenure in years (floored at 0.5) to give a
    velocity, which is mapped through ``(velocity + 1) / 3`` and clamped to
    ``[0, 1]``. A funded most-recent company adds ``0.1`` (capped at 1.0).

    Non-dict entries are dropped *before* sorting, so malformed items cannot
    crash the sort, and ``start_date`` values are compared as strings so a
    mix of missing and present dates is safe.

    Args:
        work_experience: Work-history entries; may be empty or ``None``.
        is_funded: Whether the candidate's most recent company is funded.

    Returns:
        ``0.6`` (funded) or ``0.5`` when fewer than two usable entries exist,
        otherwise the normalized velocity rounded to 4 decimals.
    """
    baseline = 0.6 if is_funded else 0.5

    roles = [entry for entry in (work_experience or []) if isinstance(entry, dict)]
    if len(roles) < 2:
        return baseline

    roles.sort(key=lambda entry: str(entry.get("start_date") or ""))

    seniority_levels = [
        _extract_seniority(role.get("title") or role.get("role") or "") for role in roles
    ]
    total_months = sum(_parse_duration_months(role) for role in roles)

    seniority_gain = seniority_levels[-1] - seniority_levels[0]
    # Floor the elapsed time so very short histories do not blow up the ratio.
    years_elapsed = max(total_months / 12, 0.5)
    velocity = seniority_gain / years_elapsed

    # velocity -1 -> 0.0, velocity +2 -> 1.0; anything outside is clamped.
    normalized = min(max((velocity + 1) / 3, 0.0), 1.0)
    if is_funded:
        normalized = min(normalized + 0.1, 1.0)
    return round(normalized, 4)
+def skill_jaccard(jd_skills: list[str], candidate_skills: list[str]) -> float:
+    if not jd_skills:
+        return 0.5
+    jd_set = {s.lower().strip() for s in jd_skills if s}
+    cand_set = {s.lower().strip() for s in candidate_skills if s}
+    if not cand_set:
+        return 0.0
+    intersection = jd_set & cand_set
+    union = jd_set | cand_set
+    return len(intersection) / len(union) if union else 0.0
+def yoe_match(min_yoe: float | None, max_yoe: float | None, candidate_yoe: float | None) -> float:
+    if candidate_yoe is None:
+        return 0.5
+    if min_yoe is None and max_yoe is None:
+        return 0.7
+    candidate_yoe = float(candidate_yoe)
+    if min_yoe is not None and candidate_yoe < min_yoe:
+        gap = min_yoe - candidate_yoe
+        return max(0.0, 1.0 - gap * 0.2)
+    if max_yoe is not None and candidate_yoe > max_yoe + 3:
+        return 0.7
+    return 1.0
def company_quality_signal(candidate: dict[str, Any]) -> float:
    """Score the candidate's most recent company, capped at ``1.0``.

    Starts from ``0.5`` and adds, in order:
        * ``0.2`` when ``most_recent_company_is_product_company`` is truthy,
        * ``0.15`` when ``most_recent_company_is_funded`` is truthy,
        * ``0.1`` when total funding exceeds 10,000,000,
        * a further ``0.05`` when total funding exceeds 100,000,000.

    A missing or falsy funding value is treated as ``0``.
    """
    total_funding = candidate.get("most_recent_company_total_funding") or 0

    bonuses = (
        (bool(candidate.get("most_recent_company_is_product_company")), 0.2),
        (bool(candidate.get("most_recent_company_is_funded")), 0.15),
        (total_funding > 10_000_000, 0.1),
        (total_funding > 100_000_000, 0.05),
    )

    score = 0.5
    for applies, bonus in bonuses:
        if applies:
            score += bonus
    return min(score, 1.0)
def education_match(candidate: dict[str, Any]) -> float:
    """Score a candidate's education in ``[0.5, 1.0]``.

    The base score comes from the highest degree level found in
    ``candidate["degree"]``: ``0.6`` for a bachelor's, ``0.8`` for a master's
    or MBA, ``0.9`` for a doctorate, and ``0.5`` otherwise. A ``0.15`` bonus
    (capped at 1.0) is added when a ``TIER1_EDU`` keyword starts a word in the
    degree or in ``candidate["education_status"]``.

    Matching details:
        * "BE" / "B.E" is matched as a standalone word, so it is recognised
          at the end of the string and is not triggered by words that merely
          end in "be".
        * Tier-1 keywords must begin a word ("IIT", "IITB", "MIT"), so
          "community" (contains "nit") or "admitted" (contains "mit") no
          longer earn the bonus.
    """
    degree = (candidate.get("degree") or "").lower()
    status = (candidate.get("education_status") or "").lower()

    score = 0.5
    if "bachelor" in degree or "b.tech" in degree or re.search(r"\bb\.?e\b", degree):
        score = 0.6
    if "master" in degree or "m.tech" in degree or "mba" in degree:
        score = 0.8
    if "phd" in degree or "ph.d" in degree or "doctorate" in degree:
        score = 0.9

    # Anchor each keyword at a word start; sorted() keeps the pattern stable.
    tier1_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(name) for name in sorted(TIER1_EDU)) + r")"
    )
    if tier1_pattern.search(degree) or tier1_pattern.search(status):
        score = min(score + 0.15, 1.0)
    return score
def compute_jd_quality(raw_text: str, parsed: dict[str, Any], candidate_count: int = 0) -> dict[str, Any]:
    """Assess how usable a job description is for candidate matching.

    Args:
        raw_text: The original JD text; ``None`` is treated as empty.
        parsed: Parsed requirements; reads ``required_skills`` and
            ``min_yoe``. ``None`` is treated as empty.
        candidate_count: Size of the candidate pool; the breadth check only
            applies when it is positive.

    Returns:
        A dict with:
            * ``overall``: ``"good"``, ``"fair"`` or ``"poor"``.
            * ``vagueness_score``: 0.2 / 0.5 / 0.75 / 1.0 for >=5 / >=3 /
              >=1 / 0 required skills, plus 0.3 (capped at 1.0) when the text
              has fewer than 50 words.
            * ``breadth_score``: 0.9 when there are candidates but fewer than
              two required skills, else 0.0.
            * ``skill_count``, ``contradictions``, ``warnings``.
    """
    raw_text = raw_text or ""
    parsed = parsed or {}
    text_lower = raw_text.lower()

    required_skills = parsed.get("required_skills") or []
    skill_count = len(required_skills)

    # Fewer explicit skills means a vaguer JD.
    if skill_count >= 5:
        vagueness_score = 0.2
    elif skill_count >= 3:
        vagueness_score = 0.5
    elif skill_count >= 1:
        vagueness_score = 0.75
    else:
        vagueness_score = 1.0

    word_count = len(raw_text.split())
    if word_count < 50:
        vagueness_score = min(vagueness_score + 0.3, 1.0)

    contradictions = []
    min_yoe = parsed.get("min_yoe")
    # Compare against None explicitly so that min_yoe == 0 (a genuine
    # entry-level requirement) still takes part in the "senior" check.
    if min_yoe is not None:
        if min_yoe >= 5 and "junior" in text_lower:
            contradictions.append("Requires 5+ YOE but mentions junior role")
        if min_yoe <= 1 and "senior" in text_lower:
            contradictions.append("Entry-level YOE but expects senior candidate")

    breadth_score = 0.9 if candidate_count > 0 and skill_count < 2 else 0.0

    warnings = []
    if vagueness_score > 0.6:
        warnings.append("JD is too vague — add more specific skill requirements for better match quality")
    if contradictions:
        warnings.append(f"Contradictions detected: {'; '.join(contradictions)}")
    if breadth_score > 0.7:
        warnings.append("Requirements are too broad — almost all candidates will match")

    if vagueness_score > 0.6 or contradictions or breadth_score > 0.7:
        overall = "poor"
    elif vagueness_score > 0.35:
        overall = "fair"
    else:
        overall = "good"

    return {
        "overall": overall,
        "vagueness_score": round(vagueness_score, 3),
        "breadth_score": round(breadth_score, 3),
        "skill_count": skill_count,
        "contradictions": contradictions,
        "warnings": warnings,
    }
def parse_jd_requirements(raw_text: str) -> dict[str, Any]:
    """Extract structured requirements from free-form job-description text.

    All matching is keyword/regex based and case-insensitive unless noted.

    Args:
        raw_text: The job description; ``None`` is treated as empty.

    Returns:
        A dict with:
            * ``required_skills``: lower-cased skill keywords, de-duplicated,
              grouped by category in the order the categories are scanned.
            * ``min_yoe`` / ``max_yoe``: from the first "N years of
              experience"-style phrase. A range such as "3-5 years" or
              "3 to 5 yrs" sets both; a single figure ("5+ years") sets only
              ``min_yoe``. Decimals ("2.5 years") are supported.
            * ``role_type``: ``"full-time"``, ``"contract"``, ``"part-time"``
              or ``None``.
            * ``engineer_type``: ``"backend"``, ``"frontend"``,
              ``"fullstack"``, ``"ai"``, ``"data"`` or ``None`` (first match
              in that order).
            * ``remote_allowed``: whether the word "remote" appears.
            * ``location``: the first known location keyword, title-cased, or
              ``None``. "US" is matched only in upper case so the pronoun
              "us" is not reported as a location.
    """
    text = raw_text or ""

    # "c++" ends in non-word characters, so a trailing \b would only match
    # when a word character follows (e.g. "c++11"); it is therefore kept
    # outside the \b-terminated group.
    skill_patterns = (
        r"\b(?:(?:python|javascript|typescript|java|go|golang|rust|ruby|php|scala|kotlin|swift)\b|c\+\+)",
        r"\b(?:react|angular|vue|nextjs|fastapi|django|flask|express|springboot|rails)\b",
        r"\b(?:postgresql|mysql|mongodb|redis|elasticsearch|kafka|rabbitmq|cassandra)\b",
        r"\b(?:aws|gcp|azure|docker|kubernetes|terraform|ansible|ci/cd|devops)\b",
        r"\b(?:machine learning|deep learning|nlp|llm|rag|vector|embedding|pytorch|tensorflow)\b",
        r"\b(?:sql|nosql|graphql|rest|grpc|microservices|api)\b",
    )
    found: list[str] = []
    for pattern in skill_patterns:
        found.extend(m.group(0).lower() for m in re.finditer(pattern, text, re.IGNORECASE))
    skills = list(dict.fromkeys(found))  # de-duplicate, keep first-seen order

    # Optional "<low> - " / "<low> to " prefix captures ranges; without it the
    # single figure is the minimum.
    yoe_match_obj = re.search(
        r"(?:(\d+(?:\.\d+)?)\s*(?:[-\u2013\u2014]|to)\s*)?"
        r"(\d+(?:\.\d+)?)\+?\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)",
        text,
        re.IGNORECASE,
    )
    min_yoe: float | None = None
    max_yoe: float | None = None
    if yoe_match_obj:
        low, high = yoe_match_obj.group(1), yoe_match_obj.group(2)
        if low is not None:
            min_yoe, max_yoe = sorted((float(low), float(high)))
        else:
            min_yoe = float(high)

    role_type = None
    if re.search(r"\bfull.?time\b", text, re.IGNORECASE):
        role_type = "full-time"
    elif re.search(r"\bcontract\b", text, re.IGNORECASE):
        role_type = "contract"
    elif re.search(r"\bpart.?time\b", text, re.IGNORECASE):
        role_type = "part-time"

    engineer_type = None
    if re.search(r"\bbackend\b", text, re.IGNORECASE):
        engineer_type = "backend"
    elif re.search(r"\bfrontend\b", text, re.IGNORECASE):
        engineer_type = "frontend"
    elif re.search(r"\bfull.?stack\b", text, re.IGNORECASE):
        engineer_type = "fullstack"
    elif re.search(r"\b(?:(?:ai|ml)\s+engineer|machine\s+learning)\b", text, re.IGNORECASE):
        engineer_type = "ai"
    elif re.search(r"\bdata\s+engineer\b", text, re.IGNORECASE):
        engineer_type = "data"

    remote_allowed = bool(re.search(r"\bremote\b", text, re.IGNORECASE))

    # Place names are case-insensitive via a scoped (?i:...) group; "US" stays
    # case-sensitive to avoid matching the pronoun "us".
    location_match = re.search(
        r"\b(?:(?i:bangalore|mumbai|delhi|hyderabad|chennai|pune|kolkata|remote|india"
        r"|usa|uk|london|new york|san francisco)|US)\b",
        text,
    )
    location = location_match.group(0).title() if location_match else None

    return {
        "required_skills": skills,
        "min_yoe": min_yoe,
        "max_yoe": max_yoe,
        "role_type": role_type,
        "engineer_type": engineer_type,
        "remote_allowed": remote_allowed,
        "location": location,
    }

backend/src/ml/reranker.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from FlagEmbedding import FlagReranker
+from ..config import get_settings
# Module-level cache for the reranker; stays None until first use.
_reranker: FlagReranker | None = None


def _get_reranker() -> FlagReranker:
    """Return the shared FlagReranker, constructing it on first call.

    The model name is read from ``get_settings().reranker_model`` and the
    reranker is created with ``use_fp16=False``. Later calls return the
    instance cached in the module-level ``_reranker``.
    """
    global _reranker
    if _reranker is not None:
        return _reranker
    _reranker = FlagReranker(get_settings().reranker_model, use_fp16=False)
    return _reranker
def rerank(query: str, passages: list[str]) -> list[float]:
    """Score each passage against ``query`` with the shared reranker.

    Args:
        query: The query text.
        passages: Candidate passages; an empty list short-circuits to ``[]``
            without loading the reranker.

    Returns:
        One float per passage, in input order, as returned by
        ``compute_score(..., normalize=True)``.
    """
    if not passages:
        return []

    scorer = _get_reranker()
    query_passage_pairs = [[query, passage] for passage in passages]
    raw_scores = scorer.compute_score(query_passage_pairs, normalize=True)

    # A bare float result is wrapped so the return value is always a list.
    score_list = [raw_scores] if isinstance(raw_scores, float) else raw_scores
    return [float(score) for score in score_list]

backend/src/workers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # workers package

backend/src/workers/celery_app.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+from celery import Celery
+from ..config import get_settings
settings = get_settings()

# The same Redis URL serves as both the message broker and the result backend.
# The ingest task module is registered via ``include``.
celery_app = Celery(
    "talentpulse",
    broker=settings.redis_url,
    backend=settings.redis_url,
    include=["src.workers.ingest"],
)

# See the Celery configuration reference for the exact semantics of each key.
celery_app.conf.update(
    {
        # JSON for task messages and results.
        "task_serializer": "json",
        "accept_content": ["json"],
        "result_serializer": "json",
        # UTC timestamps.
        "timezone": "UTC",
        "enable_utc": True,
        # Track the "started" state; expire stored results after 3600.
        "task_track_started": True,
        "result_expires": 3600,
        # Prefetch multiplier of 1 with late acknowledgement.
        "worker_prefetch_multiplier": 1,
        "task_acks_late": True,
    }
)