AgentsCourseAssignment / upload_data.py
itsskofficial's picture
added util files
7018286
raw
history blame
3.49 kB
# upload_data.py
import os
import json
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from supabase.client import Client, create_client
def upload_data_to_supabase():
    """
    Read Q&A records from metadata.jsonl, embed them, and upload them to Supabase.

    Every record in metadata.jsonl becomes one row for the 'documents' table:
      - "content":   the question and the final answer joined into one string,
      - "metadata":  a JSON object holding the record's task_id under "source",
      - "embedding": the vector returned by HuggingFaceEmbeddings.embed_query
                     for that content string.

    Configuration is read from the environment after load_dotenv() has run:
    both SUPABASE_URL and SUPABASE_SERVICE_KEY must be set.

    Progress and errors are reported with print(). The function returns None
    in every case; on a handled failure (missing configuration, unreadable or
    malformed data file, initialization, embedding or upload error) it prints
    a message and returns early instead of raising.
    """
    # --- 1. Load Environment and Configuration ---
    print("Loading configuration...")
    load_dotenv()
    supabase_url = os.environ.get("SUPABASE_URL")
    supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
    if not supabase_url or not supabase_key:
        print("Error: SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in your .env file.")
        return

    # --- 2. Load the Local Data ---
    print("Loading data from metadata.jsonl...")
    try:
        with open('metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not JSON documents; skip them instead of failing the whole load.
            records = [json.loads(line) for line in jsonl_file if line.strip()]
        print(f"Successfully loaded {len(records)} records from metadata.jsonl.")
    except FileNotFoundError:
        print("Error: metadata.jsonl not found. Make sure it is in the same directory.")
        return
    except Exception as e:
        print(f"Error reading metadata.jsonl: {e}")
        return

    if not records:
        # Nothing to embed or insert; avoid loading the model for no reason.
        print("Error: metadata.jsonl contains no records. Nothing to upload.")
        return

    # Validate the shape of every record up front so a malformed entry is
    # reported clearly before the (slow) embedding step starts.
    required_keys = ("Question", "Final answer", "task_id")
    for record_no, sample in enumerate(records, start=1):
        if not isinstance(sample, dict):
            print(f"Error: record {record_no} in metadata.jsonl is not a JSON object.")
            return
        missing = [key for key in required_keys if key not in sample]
        if missing:
            print(f"Error: record {record_no} in metadata.jsonl is missing field(s): {', '.join(missing)}")
            return

    # --- 3. Initialize Supabase Client and Embeddings Model ---
    print("Connecting to Supabase and initializing embeddings model...")
    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        print("Connection and initialization successful.")
    except Exception as e:
        print(f"Error during initialization: {e}")
        return

    # --- 4. Prepare Documents for Upload ---
    print("Preparing documents and generating embeddings. This may take a few minutes...")
    docs_to_upload = []
    total = len(records)
    try:
        for i, sample in enumerate(records):
            # Create the main content string
            content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
            # Create the structured document for upload; "metadata" is sent as
            # a JSON object rather than a pre-serialized string.
            docs_to_upload.append({
                "content": content,
                "metadata": {"source": sample['task_id']},
                "embedding": embeddings.embed_query(content),
            })
            # Optional: Print progress
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{total} documents...")
    except Exception as e:
        # Report embedding failures the same way as every other stage.
        print(f"Error while generating embeddings: {e}")
        return
    print("All documents have been processed.")

    # --- 5. Upload to Supabase ---
    print("Uploading documents to Supabase...")
    try:
        # The result of execute() is not inspected here: reaching the next
        # line without an exception is treated as a successful insert.
        supabase.table("documents").insert(docs_to_upload).execute()
        print("\n--- Success! ---")
        print(f"Successfully uploaded {len(docs_to_upload)} documents to your Supabase table.")
    except Exception as e:
        print("\n--- Error during upload ---")
        print(f"An error occurred while uploading to Supabase: {e}")
        print("Please check your Supabase table schema and permissions.")
# Script entry point: perform the upload only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    upload_data_to_supabase()