Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Load the question/answer metadata, stored as JSON Lines (one JSON object
# per line). The encoding is given explicitly so the read does not depend on
# the platform's default locale encoding.
with open('metadata.jsonl', 'r', encoding='utf-8') as f:
    json_list = list(f)

# Parse every record. Blank lines (for example a trailing empty line) are
# skipped, because json.loads would raise JSONDecodeError on them.
json_QA = [json.loads(json_str) for json_str in json_list if json_str.strip()]
| #test access to the metadata | |
| # import random | |
| # random_samples = random.sample(json_QA, 1) | |
| # for sample in random_samples: | |
| # print("=" * 50) | |
| # print(f"Task ID: {sample['task_id']}") | |
| # print(f"Question: {sample['Question']}") | |
| # print(f"Level: {sample['Level']}") | |
| # print(f"Final Answer: {sample['Final answer']}") | |
| # print(f"Annotator Metadata: ") | |
| # print(f" ├── Steps: ") | |
| # for step in sample['Annotator Metadata']['Steps'].split('\n'): | |
| # print(f" │ ├── {step}") | |
| # print(f" ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}") | |
| # print(f" ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}") | |
| # print(f" ├── Tools:") | |
| # for tool in sample['Annotator Metadata']['Tools'].split('\n'): | |
| # print(f" │ ├── {tool}") | |
| # print(f" └── Number of tools: {sample['Annotator Metadata']['Number of tools']}") | |
| # print("=" * 50) | |
| #initialize the supabase client | |
| import os | |
| from dotenv import load_dotenv | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import SupabaseVectorStore | |
| from supabase.client import Client, create_client | |
| from langchain.embeddings import OpenAIEmbeddings | |
| load_dotenv() | |
# Connection settings come from the environment; each value is None when the
# corresponding variable is not set.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

# Module-level Supabase client built from those two settings.
supabase: Client = create_client(
    supabase_url,
    supabase_key,
)
# Embedding model: OpenAI "text-embedding-3-small", authenticated with the
# key read from the OPENAI_KEY environment variable.
embeddings = OpenAIEmbeddings(
    api_key=os.getenv("OPENAI_KEY"),
    model="text-embedding-3-small",
)
def get_embedding(text: str) -> list[float]:
    """Return the embedding vector for ``text``.

    Delegates to ``embed_query`` on the module-level ``embeddings`` model.

    Args:
        text: The string to embed.

    Returns:
        The result of ``embeddings.embed_query(text)``.
    """
    return embeddings.embed_query(text)
| # #insert data into database | |
| # from langchain.schema import Document | |
| # docs = [] | |
| # cnt = 0 | |
| # for sample in json_QA: | |
| # content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}" | |
| # doc = { | |
| # "id" : cnt, | |
| # "content" : content, | |
| # "metadata" : { | |
| # "source" : sample['task_id'] | |
| # }, | |
| # "embedding" : get_embedding(content), | |
| # } | |
| # docs.append(doc) | |
| # cnt += 1 | |
| # print(f'total number of documents: {cnt+1}') | |
| # # upload the documents to the vector database | |
| # try: | |
| # response = ( | |
| # supabase.table("documents_agent") | |
| # .insert(docs) | |
| # .execute() | |
| # ) | |
| # except Exception as exception: | |
| # print("Error inserting data into Supabase:", exception) | |
# Set up the vector store over the "documents_agent" table. Nothing is
# inserted here; the store is only configured with the Supabase client, the
# embedding model and the "match_documents" query name.
vector_store = SupabaseVectorStore(
    embedding=embeddings,
    client=supabase,
    query_name="match_documents",
    table_name="documents_agent",
)

# Retriever view of the store, created with as_retriever()'s default arguments.
retriever = vector_store.as_retriever()
| # query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?" | |
| # # matched_docs = vector_store.similarity_search(query, k=2) | |
| # retrieved_docs = retriever.invoke(query) | |
| # print(retrieved_docs[0]) | |