Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import numpy as np | |
| from typing import List, Any | |
| from dotenv import load_dotenv | |
| from supabase import create_client | |
| from sentence_transformers import SentenceTransformer | |
# Load variables from a .env file (if any) before the credentials are read.
load_dotenv()

# Supabase connection settings; either may be None when the variable is unset.
SUPABASE_URL = os.getenv("SUPABASE_URL")
# Prefer the service-role key and fall back to SUPABASE_KEY when it is
# missing or empty.
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") or os.getenv("SUPABASE_KEY")
# Module-level cache for the embedding model. A global is used deliberately so
# the model is loaded at most once per process, however many times this module
# is imported or get_model() is called.
_model = None


def get_model():
    """Return the shared BAAI/bge-m3 model, loading it on first use.

    The first call prints a progress message, builds the
    ``SentenceTransformer`` and stores it in the module-level ``_model``;
    later calls return that cached instance.
    """
    global _model
    if _model is not None:
        return _model
    print("📥 Loading BAAI/bge-m3 model...")
    _model = SentenceTransformer('BAAI/bge-m3')
    return _model
def get_supabase():
    """Create a Supabase client from the module-level credentials.

    Returns:
        The client built by ``create_client``, or ``None`` (after printing an
        error) when ``SUPABASE_URL`` or ``SUPABASE_KEY`` is missing or empty.
    """
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    print("❌ Missing Supabase credentials for embeddings.")
    return None
def generate_embedding(text: str) -> List[float]:
    """Embed a single piece of text with the shared model.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a plain list of floats, encoded with
        ``normalize_embeddings=True``. Falsy or whitespace-only input yields
        a 1024-element zero vector and never touches the model.
    """
    has_content = bool(text) and bool(text.strip())
    if has_content:
        vector = get_model().encode(text, normalize_embeddings=True)
        return vector.tolist()
    # 1024 matches the BGE-M3 output width, so the placeholder has the same
    # length as a real embedding.
    return [0.0] * 1024
def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed a list of strings as the mean of their individual embeddings.

    Entries that are not strings, or that are empty/whitespace-only, are
    ignored. This mirrors ``generate_embedding``, which treats blank text as
    "nothing to embed", and keeps stray ``None``/non-text values from
    reaching the encoder.

    Args:
        items: Texts to embed together (for example a list of skills).

    Returns:
        The element-wise mean of the per-item embeddings as a list of floats,
        or a 1024-element zero vector when no usable text remains. The items
        are encoded with ``normalize_embeddings=True``, but the averaged
        vector is not re-normalised afterwards.
    """
    texts = [item for item in (items or []) if isinstance(item, str) and item.strip()]
    if not texts:
        # Same placeholder width as a real BGE-M3 embedding.
        return [0.0] * 1024
    embeddings = get_model().encode(texts, normalize_embeddings=True)
    # Mean pooling across items (axis 0) collapses the batch to one vector.
    return np.mean(embeddings, axis=0).tolist()
def _parse_list(val: Any) -> List[str]:
    """Normalise a list-like profile field into a list of non-blank strings."""
    if isinstance(val, str):
        # CSV form: "python, sql" -> ["python", "sql"]
        candidates = val.split(",")
    elif isinstance(val, list):
        candidates = val
    else:
        return []
    # Drop non-string and blank entries so they never reach the encoder.
    return [c.strip() for c in candidates if isinstance(c, str) and c.strip()]


def _flatten_experience(raw: Any) -> List[str]:
    """Turn a work-experience list of dicts/strings into one text per entry."""
    if not isinstance(raw, list):
        return []
    texts = []
    for item in raw:
        if isinstance(item, dict):
            role_ = item.get("role") or ""
            comp_ = item.get("company") or ""
            desc_ = item.get("description") or ""
            # Skip entries with nothing to say; otherwise the template alone
            # (" at . ") would be embedded as if it were content.
            if role_ or comp_ or desc_:
                # Flattened form: "<role> at <company>. <description>"
                texts.append(f"{role_} at {comp_}. {desc_}")
        elif isinstance(item, str) and item.strip():
            texts.append(item)
    return texts


def safe_generate_and_store_embeddings(client, user_id: str) -> None:
    """Build embeddings for one profile and upsert them to profile_embeddings.

    Reads the row with ``id == user_id`` from the ``profiles`` table, embeds
    its role/headline, summary, skills, technical skills, work experience and
    certifications, and upserts the vectors into ``profile_embeddings``.

    Args:
        client: Supabase client, e.g. the value returned by ``get_supabase()``.
            ``None`` (what ``get_supabase()`` returns without credentials) is
            reported and skipped instead of raising ``AttributeError``.
        user_id: Primary key of the profile to process.

    Returns:
        None. Progress and failures are reported with ``print``.

    Note:
        Errors raised while fetching the profile propagate to the caller.
        Errors during embedding generation or the upsert are caught and
        printed.
    """
    if client is None:
        print("❌ No Supabase client available; skipping embeddings.")
        return

    print(f"🧬 Generating embeddings for User: {user_id}")

    # 1. Fetch profile
    resp = client.table("profiles").select("*").eq("id", user_id).execute()
    if not resp.data:
        print(f"⚠️ Profile not found for {user_id}")
        return
    profile = resp.data[0]

    # 2. Extract fields
    # Plain text fields; missing/NULL values become empty strings.
    summary = profile.get("summary") or ""
    headline = profile.get("headline") or ""
    role = profile.get("role") or ""

    # List fields may arrive as arrays or as comma-separated strings.
    skills = _parse_list(profile.get("skills"))
    tech_skills = _parse_list(profile.get("technical_skills"))
    certifications = _parse_list(profile.get("certifications"))

    # Work experience may be a list of dicts (JSON) or of plain strings.
    experience = _flatten_experience(profile.get("work_experience"))

    try:
        # 3. Generate embeddings
        current_position_emb = generate_embedding(f"{role} {headline}")
        summary_emb = generate_embedding(summary)
        skills_emb = generate_list_embedding(skills)
        technical_skills_emb = generate_list_embedding(tech_skills)
        experience_emb = generate_list_embedding(experience)
        certifications_emb = generate_list_embedding(certifications)

        # 4. Upsert
        # Column names are expected to match create_profile_embeddings.sql.
        payload = {
            "id": user_id,
            "headline": current_position_emb,
            "summary": summary_emb,
            "skills": skills_emb,
            "technical_skills": technical_skills_emb,
            "experience": experience_emb,
            "certifications": certifications_emb,
            # NOTE(review): sent as the literal string "now()"; presumably the
            # database parses it as the current timestamp - confirm.
            "updated_at": "now()",
        }
        client.table("profile_embeddings").upsert(payload).execute()
        print(f"✅ Embeddings stored for {user_id}")
    except Exception as e:
        # Best-effort: a failure for one user is reported, not raised.
        print(f"❌ Embedding generation failed: {e}")
if __name__ == "__main__":
    # Manual smoke test: only checks that a Supabase client can be created.
    supabase_client = get_supabase()
    if supabase_client:
        # Placeholder - call safe_generate_and_store_embeddings() here with a
        # valid profile id when testing by hand.
        pass