File size: 3,488 Bytes
7018286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# upload_data.py
import os
import json
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from supabase.client import Client, create_client

def upload_data_to_supabase():
    """
    Read records from metadata.jsonl, embed them, and upload them to Supabase.

    Each non-blank line of ``metadata.jsonl`` (in the current working
    directory) must be a JSON object with the keys ``Question``,
    ``Final answer`` and ``task_id``. For every record a text of the form
    ``"Question : ...\\n\\nFinal answer : ..."`` is built, embedded with the
    ``sentence-transformers/all-mpnet-base-v2`` model, and inserted into the
    Supabase table named ``documents`` with the columns ``content``,
    ``metadata`` (``{"source": task_id}``) and ``embedding``.

    Configuration is read from the environment (after ``load_dotenv()``):
    ``SUPABASE_URL`` and ``SUPABASE_SERVICE_KEY`` must both be set.

    Returns:
        None. Progress and every failure are reported on stdout; on any
        failure the function returns early without raising.
    """
    # --- 1. Load Environment and Configuration ---
    print("Loading configuration...")
    load_dotenv()

    supabase_url = os.environ.get("SUPABASE_URL")
    supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

    if not supabase_url or not supabase_key:
        print("Error: SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in your .env file.")
        return

    # --- 2. Load the Local Data ---
    print("Loading data from metadata.jsonl...")
    try:
        with open('metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:
            # Skip blank lines (e.g. a trailing empty line) instead of letting
            # json.loads fail on them and abort the whole load.
            json_QA = [json.loads(line) for line in jsonl_file if line.strip()]
    except FileNotFoundError:
        print("Error: metadata.jsonl not found. Make sure it is in the same directory.")
        return
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading metadata.jsonl: {e}")
        return
    print(f"Successfully loaded {len(json_QA)} records from metadata.jsonl.")

    if not json_QA:
        print("Error: metadata.jsonl contains no records. Nothing to upload.")
        return

    # Validate every record before the expensive embedding step so a malformed
    # record is reported cleanly instead of crashing with a KeyError midway.
    required_fields = ("Question", "Final answer", "task_id")
    for record_no, sample in enumerate(json_QA, start=1):
        if not isinstance(sample, dict):
            print(f"Error: record {record_no} in metadata.jsonl is not a JSON object.")
            return
        missing = [field for field in required_fields if field not in sample]
        if missing:
            print(
                f"Error: record {record_no} in metadata.jsonl is missing "
                f"required field(s): {', '.join(missing)}."
            )
            return

    # --- 3. Initialize Supabase Client and Embeddings Model ---
    print("Connecting to Supabase and initializing embeddings model...")
    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        print("Connection and initialization successful.")
    except Exception as e:
        print(f"Error during initialization: {e}")
        return

    # --- 4. Prepare Documents for Upload ---
    print("Preparing documents and generating embeddings. This may take a few minutes...")
    docs_to_upload = []
    total = len(json_QA)
    for i, sample in enumerate(json_QA):
        # Create the main content string
        content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"

        # Create the vector embedding for the content
        try:
            embedding = embeddings.embed_query(content)
        except Exception as e:
            # Report in the same style as the other steps; nothing has been
            # uploaded yet, so aborting here leaves the table untouched.
            print(f"Error generating embedding for record {i + 1}: {e}")
            return

        # Create the structured document for upload; metadata is a plain dict
        # so it is sent as a JSON object rather than a string.
        docs_to_upload.append({
            "content": content,
            "metadata": {"source": sample['task_id']},
            "embedding": embedding,
        })

        # Optional: Print progress
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{total} documents...")

    print("All documents have been processed.")

    # --- 5. Upload to Supabase ---
    print("Uploading documents to Supabase...")
    try:
        # All rows go in a single insert request; the response is not needed
        # because a failed request raises.
        supabase.table("documents").insert(docs_to_upload).execute()
        print("\n--- Success! ---")
        print(f"Successfully uploaded {len(docs_to_upload)} documents to your Supabase table.")
    except Exception as e:
        print("\n--- Error during upload ---")
        print(f"An error occurred while uploading to Supabase: {e}")
        print("Please check your Supabase table schema and permissions.")

# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    upload_data_to_supabase()