|
|
|
|
|
import os |
|
|
import json |
|
|
from dotenv import load_dotenv |
|
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
from supabase.client import Client, create_client |
|
|
|
|
|
def _load_jsonl_records(path):
    """Read a JSON Lines file and return its decoded records as a list.

    Whitespace-only lines are skipped so that a stray blank line does not
    abort the whole load (``json.loads`` rejects an empty document).

    Args:
        path: Location of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a non-blank line is not valid JSON; the message
            carries the 1-based line number of the offending line.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        # Iterate the file lazily instead of buffering every raw line first.
        for line_number, raw_line in enumerate(jsonl_file, start=1):
            if not raw_line.strip():
                continue
            try:
                records.append(json.loads(raw_line))
            except json.JSONDecodeError as exc:
                raise ValueError(f"line {line_number}: {exc}") from exc
    return records


def upload_data_to_supabase() -> None:
    """
    Reads data from metadata.jsonl, generates embeddings,
    and uploads it to a Supabase table named 'documents'.

    Configuration is read from the environment after ``load_dotenv()``:
    ``SUPABASE_URL`` and ``SUPABASE_SERVICE_KEY`` must both be set.

    Each record must provide the keys ``Question``, ``Final answer`` and
    ``task_id``. For every record a row is built with:

    * ``content``   - the question and final answer joined into one text,
    * ``metadata``  - ``{"source": <task_id>}``,
    * ``embedding`` - the vector returned by ``embed_query`` for ``content``.

    Every failure (missing configuration, unreadable or malformed input,
    initialization, embedding or upload errors) is reported on stdout and
    the function returns early; nothing is raised to the caller.

    Returns:
        None.
    """
    print("Loading configuration...")
    load_dotenv()

    supabase_url = os.environ.get("SUPABASE_URL")
    supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

    if not supabase_url or not supabase_key:
        print("Error: SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in your .env file.")
        return

    print("Loading data from metadata.jsonl...")
    try:
        records = _load_jsonl_records('metadata.jsonl')
    except FileNotFoundError:
        print("Error: metadata.jsonl not found. Make sure it is in the same directory.")
        return
    except Exception as e:
        print(f"Error reading metadata.jsonl: {e}")
        return
    print(f"Successfully loaded {len(records)} records from metadata.jsonl.")

    if not records:
        # Bail out before loading the embedding model or sending an empty
        # insert: there is nothing to do.
        print("No records found in metadata.jsonl. Nothing to upload.")
        return

    print("Connecting to Supabase and initializing embeddings model...")
    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        print("Connection and initialization successful.")
    except Exception as e:
        print(f"Error during initialization: {e}")
        return

    print("Preparing documents and generating embeddings. This may take a few minutes...")
    docs_to_upload = []
    total = len(records)
    for position, sample in enumerate(records, start=1):
        # Pull every required field up front so a malformed record is
        # reported before any time is spent embedding it.
        try:
            content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
            source = sample['task_id']
        except (KeyError, TypeError) as e:
            print(f"Error: record {position} is malformed or missing a required field: {e!r}")
            return

        try:
            embedding = embeddings.embed_query(content)
        except Exception as e:
            print(f"Error generating embedding for record {position}: {e}")
            return

        docs_to_upload.append(
            {
                "content": content,
                "metadata": {"source": source},
                "embedding": embedding,
            }
        )

        if position % 10 == 0:
            print(f"Processed {position}/{total} documents...")

    print("All documents have been processed.")

    print("Uploading documents to Supabase...")
    try:
        # All rows go out in one insert call, exactly as before.
        supabase.table("documents").insert(docs_to_upload).execute()
    except Exception as e:
        print("\n--- Error during upload ---")
        print(f"An error occurred while uploading to Supabase: {e}")
        print("Please check your Supabase table schema and permissions.")
    else:
        print("\n--- Success! ---")
        print(f"Successfully uploaded {len(docs_to_upload)} documents to your Supabase table.")
|
|
|
|
|
# Script entry point: run the upload only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    upload_data_to_supabase()