File size: 3,383 Bytes
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad01d65
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b3a33f
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import numpy as np
from typing import List
from dotenv import load_dotenv
from supabase import create_client
from sentence_transformers import SentenceTransformer

# Run load_dotenv() first so the lookups below see any values it provides.
load_dotenv()

# Supabase connection settings, read once at import time.
SUPABASE_URL = os.getenv("SUPABASE_URL")
# The service-role key wins; SUPABASE_KEY is used when the former is unset or empty.
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") or os.getenv("SUPABASE_KEY")

# Module-wide cache for the embedding model, filled on first use by get_model()
# (same pattern as profile code).
_model = None

def get_model():
    """Return the shared SentenceTransformer, creating it on the first call.

    The instance is stored in the module-level ``_model`` so later calls
    reuse it instead of constructing the model again.
    """
    global _model
    # Fast path: the model was already built by an earlier call.
    if _model is not None:
        return _model

    print("📥 Loading BAAI/bge-m3 model for job embeddings...")
    _model = SentenceTransformer("BAAI/bge-m3")
    return _model

def get_supabase():
    """Build a Supabase client from the module-level URL and key.

    Returns:
        The object returned by ``create_client(SUPABASE_URL, SUPABASE_KEY)``,
        or None (after printing a message) when either setting is missing
        or empty.
    """
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)

    print("❌ Missing Supabase credentials for job embeddings.")
    return None

# -------- Embedding helpers (IDENTICAL LOGIC) --------

def generate_embedding(text: str) -> List[float]:
    """Embed a single piece of text as a list of floats.

    Args:
        text: Text to encode. Falsy or whitespace-only values are not sent
            to the model.

    Returns:
        The model's encoding of ``text`` (requested with
        ``normalize_embeddings=True``) converted to a list, or a list of
        1024 zeros when there is nothing to encode.
    """
    has_content = bool(text) and bool(text.strip())
    if has_content:
        vector = get_model().encode(text, normalize_embeddings=True)
        return vector.tolist()

    # Placeholder for blank input; 1024 presumably matches the model's
    # output dimension — TODO confirm.
    return [0.0] * 1024

def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed a list of strings as one averaged vector.

    Args:
        items: Strings to encode together. An empty or None value skips the
            model entirely.

    Returns:
        The element-wise mean (``axis=0``) of the per-item encodings as a
        list, or a list of 1024 zeros when ``items`` is empty.
    """
    if items:
        encoder = get_model()
        vectors = encoder.encode(items, normalize_embeddings=True)
        # NOTE: each item vector is requested normalized, but the mean itself
        # is returned as-is (it is not re-normalized).
        return np.mean(vectors, axis=0).tolist()

    # Placeholder for an empty list; 1024 presumably matches the model's
    # output dimension — TODO confirm.
    return [0.0] * 1024

# ----------------------------------------------------

def safe_generate_and_store_job_embeddings(client, job_id: str) -> None:
    """
    Fetches job entities, generates entity-wise embeddings,
    and upserts them into job_embeddings table.

    Best-effort: a missing client, a failed fetch, a missing row, or any
    ``Exception`` raised while embedding or upserting is printed and the
    function returns None instead of raising.

    Args:
        client: Supabase client used for both the read and the upsert. May be
            None (``get_supabase()`` returns None when credentials are
            missing), in which case nothing is done.
        job_id: Value matched against ``jobs_entities.job_id`` and written as
            ``job_id`` in the upserted ``job_embeddings`` row.
    """
    # get_supabase() can hand back None; without this guard the first
    # client.table(...) call would raise AttributeError.
    if client is None:
        print("❌ No Supabase client available; skipping job embeddings.")
        return

    print(f"🧬 Generating job embeddings for Job: {job_id}")

    # 1. Fetch job entities. Kept inside its own try so a network/API error
    #    is reported rather than escaping from a function named "safe".
    try:
        resp = (
            client.table("jobs_entities")
            .select("*")
            .eq("job_id", job_id)
            .execute()
        )
    except Exception as e:
        print(f"❌ Failed to fetch job entities for job_id={job_id}: {e}")
        return

    if not resp.data:
        print(f"⚠️ Job entities not found for job_id={job_id}")
        return

    # Only the first matching row is used.
    entities = resp.data[0]

    # 2. Parse list fields safely (same pattern)
    def parse_list(val):
        """Coerce a stored value to a list: lists pass through unchanged,
        strings are split on commas (blank pieces dropped), anything else
        (including falsy values) becomes an empty list."""
        if not val:
            return []
        if isinstance(val, list):
            return val
        if isinstance(val, str):
            return [x.strip() for x in val.split(",") if x.strip()]
        return []

    skills = parse_list(entities.get("skills"))
    technical_skills = parse_list(entities.get("technical_skills"))
    tools = parse_list(entities.get("tools"))
    certifications = parse_list(entities.get("certifications"))

    # Missing or null text fields are embedded as empty text.
    experience = entities.get("experience") or ""
    education = entities.get("education") or ""

    try:
        # 3. Generate embeddings (ENTITY-WISE)
        payload = {
            "job_id": job_id,
            "skills": generate_list_embedding(skills),
            "technical_skills": generate_list_embedding(technical_skills),
            "tools": generate_list_embedding(tools),
            # The source column "experience" is stored under "work_experience".
            "work_experience": generate_embedding(experience),
            "education": generate_embedding(education),
            "certifications": generate_list_embedding(certifications),
            # Sent as the literal string "now()"; presumably resolved to the
            # current time by the database — TODO confirm.
            "updated_at": "now()",
        }

        # 4. Upsert into job_embeddings
        client.table("job_embeddings").upsert(payload).execute()
        print(f"✅ Job embeddings stored for job_id={job_id}")

    except Exception as e:
        print(f"❌ Job embedding generation failed: {e}")