pavan-d committed on
Commit
68ac8d1
·
verified ·
1 Parent(s): da5a304

Create load_vectorstore.py

Browse files
Files changed (1) hide show
  1. load_vectorstore.py +43 -0
load_vectorstore.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Embed the records of data/metadata.jsonl and insert them into Supabase.

Every non-blank line of the JSONL file is parsed as one JSON object. Its
"Question" and "Final answer" fields are combined into a single text, the
text is embedded with a sentence-transformers model, and one row holding
``content``, ``embedding`` and ``metadata`` is inserted into the
``documents`` table.

The SUPABASE_URL and SUPABASE_SERVICE_KEY environment variables must be set;
a missing variable raises KeyError before any work is done.
"""
import os
import json

from supabase import create_client
from sentence_transformers import SentenceTransformer

# Load environment variables
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_KEY = os.environ["SUPABASE_SERVICE_KEY"]

# Connect to Supabase
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

# Load the sentence-transformer model
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Load the .jsonl file.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("data/metadata.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Blank or whitespace-only lines are not JSON documents; skipping
        # them keeps json.loads from aborting the whole load.
        if not line.strip():
            continue

        item = json.loads(line)

        question = item.get("Question")
        answer = item.get("Final answer", "")

        # Format content like LangChain expects
        content = f"Question: {question}\nAnswer: {answer}"
        # encode() output is converted to a plain list so it can be sent as JSON.
        embedding = embedder.encode(content).tolist()

        # Optional metadata, remove large fields like step-by-step details if not needed
        metadata = {
            "task_id": item.get("task_id"),
            "level": item.get("Level"),
            "file_name": item.get("file_name"),
            "annotator_metadata": item.get("Annotator Metadata", {}),
        }

        # Insert into Supabase
        supabase.table("documents").insert({
            "content": content,
            "embedding": embedding,
            "metadata": metadata,
        }).execute()

        # Report the id via the already-extracted metadata value so a record
        # without "task_id" cannot raise KeyError after its row was inserted.
        print(f"✅ Inserted: {metadata['task_id']}")