"""Semantic candidate matching backed by Pinecone and a local embedding model."""

import os
from typing import List

from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

from app.models.schemas import Candidate


class MatchService:
    """Rank candidates against a job description via vector similarity.

    Configuration is read from environment variables at construction time:

    * ``PINECONE_API_KEY`` -- API key passed to the Pinecone client.
    * ``PINECONE_INDEX``   -- index name (default ``"coderound"``).
    * ``EMBEDDING_MODEL``  -- SentenceTransformer model name
      (default ``"BAAI/bge-m3"``).
    * ``STAGE2_TOP_K``     -- number of matches requested per query
      (default ``20``).
    """

    # Vectors sent per upsert request. A single request carrying every
    # candidate can exceed Pinecone's per-request limits for large batches.
    _UPSERT_BATCH_SIZE = 100

    def __init__(self):
        self.pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        self.index_name = os.getenv("PINECONE_INDEX", "coderound")
        self.index = self.pc.Index(self.index_name)

        # Load local embedding model
        model_name = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
        self.model = SentenceTransformer(model_name)

        self.top_k = int(os.getenv("STAGE2_TOP_K", "20"))

    def get_embedding(self, text: str) -> List[float]:
        """Return the embedding of ``text`` as a plain Python list."""
        return self.model.encode(text).tolist()

    async def get_top_candidates(
        self, jd: str, candidates: List[Candidate]
    ) -> List[Candidate]:
        """Return the candidates whose embeddings best match the job description.

        The candidates are embedded and upserted into the Pinecone index, the
        index is queried with the embedded ``jd``, and matches are mapped back
        to the supplied ``Candidate`` objects in the order Pinecone returned
        them.

        Args:
            jd: Job description text.
            candidates: Candidates to index and rank.

        Returns:
            At most ``self.top_k`` of the supplied candidates. An empty list
            is returned when ``candidates`` is empty.

        NOTE(review): the embedding, upsert and query calls below are invoked
        synchronously inside this coroutine, so they run on the event loop
        thread -- confirm that is acceptable for the callers.
        """
        # Nothing to index or rank; avoid an empty upsert and a pointless query.
        if not candidates:
            return []

        # 1. Prepare vectors for batch upload
        # Combine fields for semantic weight
        search_texts = [
            f"{c.name} {c.skills} {c.experience} {c.projects} {c.resume_text}"
            for c in candidates
        ]
        # Encode the whole batch in one call instead of one call per candidate.
        embeddings = self.model.encode(search_texts).tolist()

        vectors = [
            {
                "id": c.id,
                "values": embedding,
                "metadata": {"name": c.name, "email": c.email},
            }
            for c, embedding in zip(candidates, embeddings)
        ]
        candidate_map = {c.id: c for c in candidates}

        # 2. Upsert to Pinecone, in bounded chunks
        for start in range(0, len(vectors), self._UPSERT_BATCH_SIZE):
            chunk = vectors[start:start + self._UPSERT_BATCH_SIZE]
            self.index.upsert(vectors=chunk)

        # 3. Embed JD and Query
        # NOTE(review): the query is not restricted to the ids upserted above,
        # so vectors already present in the index can occupy top_k slots and
        # are dropped in step 4; freshly upserted vectors may also not be
        # visible to the query immediately -- verify against the index setup.
        jd_embedding = self.get_embedding(jd)
        query_results = self.index.query(
            vector=jd_embedding,
            top_k=self.top_k,
            include_metadata=True,
        )

        # 4. Map back to Candidate objects, skipping ids not in this request
        return [
            candidate_map[match.id]
            for match in query_results.matches
            if match.id in candidate_map
        ]


match_service = MatchService()