"""Embed question/answer records from a JSONL file and store them in Supabase.

Each non-blank line of ``data/metadata.jsonl`` is parsed as one JSON object.
Its question and final answer are combined into a single text, embedded with
a sentence-transformer model, and inserted into the ``documents`` table
together with a small metadata dict.

The ``SUPABASE_URL`` and ``SUPABASE_SERVICE_KEY`` environment variables must
be set; a missing one raises ``KeyError`` before any work is done.
"""

import json
import os

from sentence_transformers import SentenceTransformer
from supabase import create_client

# Load environment variables (fail fast with KeyError if either is missing).
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_KEY = os.environ["SUPABASE_SERVICE_KEY"]

# Connect to Supabase
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

# Load the sentence-transformer model
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Load the .jsonl file. The encoding is explicit so that decoding does not
# depend on the platform's default locale.
with open("data/metadata.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines are not valid JSON documents; skip them instead of
        # letting json.loads abort the whole run.
        if not line.strip():
            continue

        item = json.loads(line)
        task_id = item.get("task_id")
        question = item.get("Question")
        answer = item.get("Final answer", "")

        # Format content like LangChain expects
        content = f"Question: {question}\nAnswer: {answer}"
        embedding = embedder.encode(content).tolist()

        # Optional metadata, remove large fields like step-by-step details
        # if not needed
        metadata = {
            "task_id": task_id,
            "level": item.get("Level"),
            "file_name": item.get("file_name"),
            "annotator_metadata": item.get("Annotator Metadata", {}),
        }

        # Insert into Supabase
        supabase.table("documents").insert(
            {
                "content": content,
                "embedding": embedding,
                "metadata": metadata,
            }
        ).execute()

        # Use the same tolerant lookup as the metadata above: a record
        # without "task_id" has already been inserted at this point and must
        # not crash the loop on the log line.
        print(f"✅ Inserted: {task_id}")