# iris_backend/backend/src/embeddings/job_embed.py
# Repository-viewer residue kept for reference: "Saandraahh's picture", "Implemented clustering", 4b3a33f
import os
import numpy as np
from typing import List
from dotenv import load_dotenv
from supabase import create_client
from sentence_transformers import SentenceTransformer
# Load environment variables via python-dotenv before reading credentials
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
# Prefer the service-role key; fall back to SUPABASE_KEY when it is unset or empty
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") or os.getenv("SUPABASE_KEY")
# Singleton model (same pattern as profile code)
_model = None


def get_model():
    """Return the shared BAAI/bge-m3 SentenceTransformer, loading it on first use.

    The instance is cached in the module-level ``_model``; once it has been
    constructed, later calls return the cached object without reloading.
    """
    global _model
    if _model is None:
        # The emoji is written as a named escape so the log line survives
        # any re-encoding of this source file (it was previously mojibake).
        print("\N{INBOX TRAY} Loading BAAI/bge-m3 model for job embeddings...")
        _model = SentenceTransformer("BAAI/bge-m3")
    return _model
def get_supabase():
    """Create a Supabase client from the module-level credentials.

    Returns:
        The object built by ``create_client(SUPABASE_URL, SUPABASE_KEY)``, or
        ``None`` (after printing an error) when either credential is missing
        or empty. Callers must check for ``None`` before using the result.
    """
    if not (SUPABASE_URL and SUPABASE_KEY):
        # Named escape keeps the cross-mark emoji intact regardless of how
        # this file is re-encoded (it was previously mojibake).
        print("\N{CROSS MARK} Missing Supabase credentials for job embeddings.")
        return None
    return create_client(SUPABASE_URL, SUPABASE_KEY)
# -------- Embedding helpers (IDENTICAL LOGIC) --------
def generate_embedding(text: str) -> List[float]:
    """Embed a single piece of text with the shared model.

    Falsy or whitespace-only input yields a 1024-element zero vector and the
    model is never touched. Any other text is encoded with
    ``normalize_embeddings=True`` and returned as a plain Python list.
    """
    if text and text.strip():
        vector = get_model().encode(text, normalize_embeddings=True)
        return vector.tolist()
    # Nothing to embed: placeholder zero vector
    return [0.0] * 1024
def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed a list of strings as the element-wise mean of the item embeddings.

    An empty (or otherwise falsy) ``items`` yields a 1024-element zero vector.
    Otherwise all items go through a single ``encode`` call with
    ``normalize_embeddings=True`` and the per-item vectors are averaged along
    axis 0; the averaged vector is returned as-is, without re-normalizing.
    """
    if items:
        item_vectors = get_model().encode(items, normalize_embeddings=True)
        return np.mean(item_vectors, axis=0).tolist()
    # Nothing to embed: placeholder zero vector
    return [0.0] * 1024
# ----------------------------------------------------
def safe_generate_and_store_job_embeddings(client, job_id: str) -> None:
    """Generate entity-wise embeddings for a job and upsert them.

    Reads the first ``jobs_entities`` row whose ``job_id`` matches, embeds
    each entity field separately, and upserts a single row into
    ``job_embeddings``.

    This function is best-effort: every failure -- including an unusable
    ``client`` (e.g. ``None``) or a failing fetch -- is reported on stdout
    and swallowed rather than raised.

    Args:
        client: Supabase client; only ``client.table(...)`` is used.
        job_id: Value matched against ``jobs_entities.job_id`` and written
            to the ``job_id`` field of the upserted row.
    """
    print(f"\N{DNA DOUBLE HELIX} Generating job embeddings for Job: {job_id}")

    def as_list(value):
        """Coerce a column value to a list; comma-split strings, drop blanks."""
        if not value:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [part.strip() for part in value.split(",") if part.strip()]
        # Any other type is treated as "no items"
        return []

    try:
        # 1. Fetch job entities. This lives inside the try so that a fetch
        #    error or a None client is handled like any other failure
        #    instead of escaping from a function named "safe_...".
        resp = (
            client.table("jobs_entities")
            .select("*")
            .eq("job_id", job_id)
            .execute()
        )
        if not resp.data:
            print(
                f"\N{WARNING SIGN}\N{VARIATION SELECTOR-16} "
                f"Job entities not found for job_id={job_id}"
            )
            return
        entities = resp.data[0]

        # 2. Generate embeddings (ENTITY-WISE)
        payload = {
            "job_id": job_id,
            "skills": generate_list_embedding(as_list(entities.get("skills"))),
            "technical_skills": generate_list_embedding(
                as_list(entities.get("technical_skills"))
            ),
            "tools": generate_list_embedding(as_list(entities.get("tools"))),
            # Free-text fields: a missing/None value is embedded as ""
            "work_experience": generate_embedding(entities.get("experience") or ""),
            "education": generate_embedding(entities.get("education") or ""),
            "certifications": generate_list_embedding(
                as_list(entities.get("certifications"))
            ),
            # NOTE(review): the literal string "now()" is sent as-is;
            # presumably the database resolves it to the current
            # timestamp -- confirm against the column type.
            "updated_at": "now()",
        }

        # 3. Upsert into job_embeddings
        client.table("job_embeddings").upsert(payload).execute()
    except Exception as e:
        print(f"\N{CROSS MARK} Job embedding generation failed: {e}")
    else:
        print(f"\N{WHITE HEAVY CHECK MARK} Job embeddings stored for job_id={job_id}")