"""Load question/answer metadata and set up a Supabase-backed retriever.

At import time this module:

1. loads environment variables from a ``.env`` file,
2. parses ``metadata.jsonl`` (one JSON object per line) into ``json_QA``,
3. creates a Supabase client from ``SUPABASE_URL`` / ``SUPABASE_KEY``,
4. builds an OpenAI embedding model (``text-embedding-3-small``), and
5. exposes ``vector_store`` / ``retriever`` over the ``documents_agent`` table.

The commented-out sections are kept as manual, run-once snippets (metadata
inspection, initial ingestion, and a retrieval smoke test).
"""
import json
import os

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client
# NOTE(review): recent LangChain releases appear to ship OpenAIEmbeddings in
# the separate ``langchain_openai`` package -- confirm before upgrading.
from langchain.embeddings import OpenAIEmbeddings

# Populate os.environ from .env before any credential is read below.
load_dotenv()

# Parse the JSON Lines file: every non-blank line is one JSON record.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding, and blank lines (e.g. a trailing empty line)
# are skipped instead of making json.loads raise.
with open('metadata.jsonl', 'r', encoding='utf-8') as f:
    json_QA = [json.loads(line) for line in f if line.strip()]

# test access to the metadata
# import random
# random_samples = random.sample(json_QA, 1)
# for sample in random_samples:
#     print("=" * 50)
#     print(f"Task ID: {sample['task_id']}")
#     print(f"Question: {sample['Question']}")
#     print(f"Level: {sample['Level']}")
#     print(f"Final Answer: {sample['Final answer']}")
#     print(f"Annotator Metadata: ")
#     print(f" ├── Steps: ")
#     for step in sample['Annotator Metadata']['Steps'].split('\n'):
#         print(f" │ ├── {step}")
#     print(f" ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}")
#     print(f" ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}")
#     print(f" ├── Tools:")
#     for tool in sample['Annotator Metadata']['Tools'].split('\n'):
#         print(f" │ ├── {tool}")
#     print(f" └── Number of tools: {sample['Annotator Metadata']['Number of tools']}")
# print("=" * 50)

# initialize the supabase client
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# setup embedding model
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=os.environ.get("OPENAI_KEY"),
)


def get_embedding(text: str) -> list[float]:
    """Return the embedding vector for *text*.

    Delegates to the module-level ``embeddings`` model via ``embed_query``.

    Args:
        text: The string to embed.

    Returns:
        The value returned by ``embeddings.embed_query(text)``.
    """
    return embeddings.embed_query(text)


# # insert data into database
# from langchain.schema import Document
# docs = []
# cnt = 0
# for sample in json_QA:
#     content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
#     doc = {
#         "id" : cnt,
#         "content" : content,
#         "metadata" : {
#             "source" : sample['task_id']
#         },
#         "embedding" : get_embedding(content),
#     }
#     docs.append(doc)
#     cnt += 1
# # NOTE: cnt already equals len(docs) after the loop, so cnt+1 below
# # over-reports the total by one.
# print(f'total number of documents: {cnt+1}')
# # upload the documents to the vector database
# try:
#     response = (
#         supabase.table("documents_agent")
#         .insert(docs)
#         .execute()
#     )
# except Exception as exception:
#     print("Error inserting data into Supabase:", exception)

# Check data in table and setup vectorstore
# (wraps the existing "documents_agent" table; nothing is inserted here)
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="documents_agent",
    query_name="match_documents",
)
retriever = vector_store.as_retriever()

# query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
# # matched_docs = vector_store.similarity_search(query, k=2)
# retrived_docs = retriever.invoke(query)
# print(retrived_docs[0])