Spaces:

Golfn
/

Agent_Course_Eval

Sleeping

App Files Files Community

Agent_Course_Eval / upload_metadata_n_setup_retrivers.py

Golfn

rename

fd14a05 9 months ago

raw

history blame contribute delete

3.48 kB

	import json
	import os
	from dotenv import load_dotenv
	load_dotenv()
	# Load the question/answer metadata, stored as one JSON object per line
	# (JSONL). The encoding is given explicitly so that decoding does not
	# depend on the platform's default locale encoding.
	with open('metadata.jsonl', 'r', encoding='utf-8') as f:
	    json_list = list(f)

	# Parse every record. Blank lines (for example a trailing empty line at
	# the end of the file) are skipped, because json.loads rejects them.
	json_QA = [json.loads(json_str) for json_str in json_list if json_str.strip()]

	#test access to the metadata
	# import random
	# random_samples = random.sample(json_QA, 1)
	# for sample in random_samples:
	# print("=" * 50)
	# print(f"Task ID: {sample['task_id']}")
	# print(f"Question: {sample['Question']}")
	# print(f"Level: {sample['Level']}")
	# print(f"Final Answer: {sample['Final answer']}")
	# print(f"Annotator Metadata: ")
	# print(f" ├── Steps: ")
	# for step in sample['Annotator Metadata']['Steps'].split('\n'):
	# print(f" │ ├── {step}")
	# print(f" ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}")
	# print(f" ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}")
	# print(f" ├── Tools:")
	# for tool in sample['Annotator Metadata']['Tools'].split('\n'):
	# print(f" │ ├── {tool}")
	# print(f" └── Number of tools: {sample['Annotator Metadata']['Number of tools']}")
	# print("=" * 50)
	#initialize the supabase client
	import os
	from dotenv import load_dotenv
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_community.vectorstores import SupabaseVectorStore
	from supabase.client import Client, create_client
	from langchain.embeddings import OpenAIEmbeddings

	load_dotenv()

	# Supabase connection settings are read from the environment
	# (populated from .env by the load_dotenv() call above).
	supabase_url = os.getenv("SUPABASE_URL")
	supabase_key = os.getenv("SUPABASE_KEY")
	supabase: Client = create_client(supabase_url, supabase_key)

	# Embedding model used by get_embedding() and by the vector store below.
	embeddings = OpenAIEmbeddings(
	    model="text-embedding-3-small",
	    api_key=os.getenv("OPENAI_KEY"),
	)

	def get_embedding(text: str) -> list[float]:
	    """Return the embedding vector for ``text``.

	    Delegates to ``embed_query`` on the module-level ``embeddings`` model.
	    """
	    return embeddings.embed_query(text)

	# #insert data into database
	# from langchain.schema import Document
	# docs = []
	# cnt = 0
	# for sample in json_QA:
	# content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
	# doc = {
	# "id" : cnt,
	# "content" : content,
	# "metadata" : {
	# "source" : sample['task_id']
	# },
	# "embedding" : get_embedding(content),
	# }
	# docs.append(doc)
	# cnt += 1
	# print(f'total number of documents: {cnt}')
	# # upload the documents to the vector database
	# try:
	# response = (
	# supabase.table("documents_agent")
	# .insert(docs)
	# .execute()
	# )
	# except Exception as exception:
	# print("Error inserting data into Supabase:", exception)

	# Build the vector store over the "documents_agent" table, using the
	# Supabase client and the embedding model configured earlier in this file.
	# NOTE(review): "match_documents" is presumably the name of a database
	# function used for similarity search - confirm against the Supabase setup.
	vector_store = SupabaseVectorStore(
	    embedding=embeddings,
	    client=supabase,
	    query_name="match_documents",
	    table_name="documents_agent",
	)

	# Expose the store through its retriever interface.
	retriever = vector_store.as_retriever()

	# query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
	# # matched_docs = vector_store.similarity_search(query, k=2)
	# retrived_docs = retriever.invoke(query)
	# print(retrived_docs[0])