Spaces:
Sleeping
Sleeping
File size: 3,383 Bytes
ea9ca44 ad01d65 ea9ca44 4b3a33f ea9ca44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | import os
import numpy as np
from typing import List
from dotenv import load_dotenv
from supabase import create_client
from sentence_transformers import SentenceTransformer
# Run python-dotenv's loader before the environment lookups below.
load_dotenv()

# Supabase connection settings; either may be None when the variable is unset.
SUPABASE_URL = os.getenv("SUPABASE_URL")
# Prefer the service-role key and fall back to SUPABASE_KEY when it is
# missing or empty.
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") or os.getenv("SUPABASE_KEY")
# Module-level cache so the embedding model is constructed at most once
# per process (same pattern as profile code).
_model = None


def get_model():
    """Return the shared SentenceTransformer, creating it on first use.

    The first call prints a loading notice and instantiates the
    "BAAI/bge-m3" model; later calls return the cached instance.
    """
    global _model
    if _model is not None:
        return _model
    print("📥 Loading BAAI/bge-m3 model for job embeddings...")
    _model = SentenceTransformer("BAAI/bge-m3")
    return _model
def get_supabase():
    """Create a Supabase client from the module-level credentials.

    Returns:
        The object returned by ``create_client`` when both SUPABASE_URL and
        SUPABASE_KEY are set; otherwise prints an error and returns None.
    """
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    print("❌ Missing Supabase credentials for job embeddings.")
    return None
# -------- Embedding helpers (IDENTICAL LOGIC) --------
def generate_embedding(text: str) -> List[float]:
    """Embed a single piece of text with the shared model.

    Args:
        text: The text to embed. Falsy or whitespace-only input is not
            sent to the model.

    Returns:
        The model's normalized embedding as a plain list of floats, or a
        list of 1024 zeros for empty input (presumably the model's output
        width -- confirm if the model is ever changed).
    """
    if text and text.strip():
        vector = get_model().encode(text, normalize_embeddings=True)
        return vector.tolist()
    # Nothing to embed: fall back to an all-zero placeholder vector.
    return [0.0] * 1024
def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed a list of strings and average them into one vector.

    Args:
        items: Strings to embed. An empty (or otherwise falsy) value is not
            sent to the model.

    Returns:
        The element-wise mean of the items' normalized embeddings as a
        plain list of floats, or a list of 1024 zeros when there are no
        items. The mean itself is not re-normalized.
    """
    if items:
        vectors = get_model().encode(items, normalize_embeddings=True)
        # Collapse the per-item vectors (axis 0) into a single averaged vector.
        return np.mean(vectors, axis=0).tolist()
    # Nothing to embed: fall back to an all-zero placeholder vector.
    return [0.0] * 1024
# ----------------------------------------------------
def _parse_list_field(val):
    """Coerce a stored list-like field into a Python list.

    Lists are returned unchanged, comma-separated strings are split into
    stripped, non-blank parts, and every other value (including falsy ones)
    yields an empty list.
    """
    if not val:
        return []
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        return [part.strip() for part in val.split(",") if part.strip()]
    return []


def safe_generate_and_store_job_embeddings(client, job_id: str) -> None:
    """Generate entity-wise embeddings for a job and upsert them.

    Fetches the job's row from the ``jobs_entities`` table, embeds each
    entity field separately, and upserts the result into the
    ``job_embeddings`` table.

    This function is best-effort: it never raises. A missing client, a
    missing entities row, and any failure while fetching, embedding, or
    storing are reported via ``print`` and the function returns None.

    Args:
        client: Supabase client used for both the read and the upsert.
            May be None (e.g. when credentials are missing), in which case
            nothing is done.
        job_id: Identifier matched against ``jobs_entities.job_id`` and
            written to ``job_embeddings.job_id``.
    """
    # A client factory may hand back None; bail out instead of crashing
    # with an AttributeError on client.table below.
    if client is None:
        print(f"❌ No Supabase client available; skipping embeddings for job_id={job_id}")
        return

    print(f"🧬 Generating job embeddings for Job: {job_id}")

    # The whole pipeline (fetch included) sits under one handler so the
    # "safe_" contract holds even when the initial query fails.
    try:
        # 1. Fetch job entities
        resp = (
            client.table("jobs_entities")
            .select("*")
            .eq("job_id", job_id)
            .execute()
        )
        if not resp.data:
            print(f"⚠️ Job entities not found for job_id={job_id}")
            return
        entities = resp.data[0]

        # 2. Parse list fields safely (same pattern)
        skills = _parse_list_field(entities.get("skills"))
        technical_skills = _parse_list_field(entities.get("technical_skills"))
        tools = _parse_list_field(entities.get("tools"))
        certifications = _parse_list_field(entities.get("certifications"))
        # Free-text fields: treat a missing/None value as empty text.
        experience = entities.get("experience") or ""
        education = entities.get("education") or ""

        # 3. Generate embeddings (ENTITY-WISE)
        payload = {
            "job_id": job_id,
            "skills": generate_list_embedding(skills),
            "technical_skills": generate_list_embedding(technical_skills),
            "tools": generate_list_embedding(tools),
            "work_experience": generate_embedding(experience),
            "education": generate_embedding(education),
            "certifications": generate_list_embedding(certifications),
            # NOTE(review): sent as the literal string "now()"; presumably
            # the database resolves it to the current time -- confirm.
            "updated_at": "now()",
        }

        # 4. Upsert into job_embeddings
        client.table("job_embeddings").upsert(payload).execute()
        print(f"✅ Job embeddings stored for job_id={job_id}")
    except Exception as e:
        print(f"❌ Job embedding generation failed: {e}")
|