{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1b0c87d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# metadata.jsonl stores one JSON object per line. Read it as UTF-8 explicitly\n",
    "# (the questions contain non-ASCII text) instead of relying on the platform default.\n",
    "with open('metadata.jsonl', 'r', encoding='utf-8') as f:\n",
    "    json_list = list(f)\n",
    "\n",
    "# Parse every non-blank line; an empty line would otherwise make json.loads raise.\n",
    "json_QA = [json.loads(json_str) for json_str in json_list if json_str.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "06ef3470",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================================================\n",
      "Task ID: 305ac316-eef6-4446-960a-92d80d542f82\n",
      "Question: Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.\n",
      "Level: 1\n",
      "Final Answer: Wojciech\n",
      "Annotator Metadata: \n",
      " ├── Steps: \n",
      " │ ├── 1. Search \"Polish-language version of Everybody Loves Raymond\" and pull up the Wiki page for Wszyscy kochają Romana.\n",
      " │ ├── 2. See that Bartłomiej Kasprzykowski is marked as playing Ray and go to his Wiki page.\n",
      " │ ├── 3. See that he is stated to have played Wojciech Płaska in Magda M.\n",
      " ├── Number of steps: 3\n",
      " ├── How long did this take?: 5 minutes\n",
      " ├── Tools:\n",
      " │ ├── None\n",
      " └── Number of tools: 0\n",
      "==================================================\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# Pretty-print one randomly chosen sample (unseeded, so the sample differs between runs).\n",
    "random_samples = random.sample(json_QA, 1)\n",
    "for sample in random_samples:\n",
    "    annotator_metadata = sample['Annotator Metadata']\n",
    "    print(\"=\" * 50)\n",
    "    print(f\"Task ID: {sample['task_id']}\")\n",
    "    print(f\"Question: {sample['Question']}\")\n",
    "    print(f\"Level: {sample['Level']}\")\n",
    "    print(f\"Final Answer: {sample['Final answer']}\")\n",
    "    print(\"Annotator Metadata: \")\n",
    "    print(\" ├── Steps: \")\n",
    "    # 'Steps' and 'Tools' are newline-separated strings, one item per line.\n",
    "    for step in annotator_metadata['Steps'].split('\\n'):\n",
    "        print(f\" │ ├── {step}\")\n",
    "    print(f\" ├── Number of steps: {annotator_metadata['Number of steps']}\")\n",
    "    print(f\" ├── How long did this take?: {annotator_metadata['How long did this take?']}\")\n",
    "    print(\" ├── Tools:\")\n",
    "    for tool in annotator_metadata['Tools'].split('\\n'):\n",
    "        print(f\" │ ├── {tool}\")\n",
    "    print(f\" └── Number of tools: {annotator_metadata['Number of tools']}\")\n",
    "print(\"=\" * 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "468f83d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import SupabaseVectorStore\n",
    "from supabase.client import Client, create_client\n",
    "\n",
    "\n",
    "# Credentials are read from the environment (populated from a .env file if present).\n",
    "load_dotenv()\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")  # dim=768\n",
    "\n",
    "supabase_url = os.environ.get(\"SUPABASE_URL\")\n",
    "supabase_key = os.environ.get(\"SUPABASE_SERVICE_KEY\")\n",
    "supabase: Client = create_client(supabase_url, supabase_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c76e4a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.schema import Document\n",
    "\n",
    "# One text per sample: the question followed by its final answer.\n",
    "contents = [\n",
    "    f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n",
    "    for sample in json_QA\n",
    "]\n",
    "# Embed all texts in one batched call instead of one model call per sample.\n",
    "content_embeddings = embeddings.embed_documents(contents)\n",
    "\n",
    "# Rows for the `documents` table; ids are the sample positions 0..N-1.\n",
    "docs = [\n",
    "    {\n",
    "        \"id\": idx,\n",
    "        \"content\": content,\n",
    "        \"metadata\": {\"source\": sample['task_id']},\n",
    "        \"embedding\": embedding,\n",
    "    }\n",
    "    for idx, (sample, content, embedding) in enumerate(\n",
    "        zip(json_QA, contents, content_embeddings)\n",
    "    )\n",
    "]\n",
    "\n",
    "# upload the documents to the vector database\n",
    "# NOTE: the same ids are sent on every run, so re-running this cell re-sends rows that\n",
    "# may already exist; any error from Supabase is printed below rather than raised.\n",
    "try:\n",
    "    response = (\n",
    "        supabase.table(\"documents\")\n",
    "        .insert(docs)\n",
    "        .execute()\n",
    "    )\n",
    "except Exception as exception:\n",
    "    print(\"Error inserting data into Supabase:\", exception)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1d6fa354",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the `documents` table as a LangChain vector store (nothing is inserted here)\n",
    "# and expose it as a retriever.\n",
    "vector_store = SupabaseVectorStore(\n",
    "    client=supabase,\n",
    "    embedding=embeddings,\n",
    "    table_name=\"documents\",\n",
    "    query_name=\"match_documents_langchain\",\n",
    ")\n",
    "retriever = vector_store.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "aed648be",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
    "# Alternative: vector_store.similarity_search(query, k=2)\n",
    "# Stored under its own name so the `docs` insert payload built earlier is not overwritten.\n",
    "matched_docs = retriever.invoke(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "a70e6e83",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d'}, page_content='Question : On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\\n\\nFinal answer : 80GSFC21M0002')"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matched_docs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87c3bf86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of the tools used in all the samples\n",
    "import re\n",
    "from collections import Counter, OrderedDict\n",
    "\n",
    "\n",
    "def normalize_tool_name(raw_tool):\n",
    "    \"\"\"Return the lower-cased tool name from one line of an annotator's tool list.\n",
    "\n",
    "    Strips a leading list number of any width (\"1.\", \"12.\") and a leading\n",
    "    parenthesised qualifier such as \"(optional)\". Returns an empty string for\n",
    "    blank lines and for the placeholder \"None\" used by samples with no tools.\n",
    "    \"\"\"\n",
    "    tool = raw_tool.strip().lower()\n",
    "    tool = re.sub(r\"^\\d+\\.\\s*\", \"\", tool)\n",
    "    tool = re.sub(r\"^\\([^)]*\\)\\s*\", \"\", tool)\n",
    "    return \"\" if tool == \"none\" else tool\n",
    "\n",
    "\n",
    "tools = []\n",
    "for sample in json_QA:\n",
    "    for raw_tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
    "        tool = normalize_tool_name(raw_tool)\n",
    "        if tool:  # skip blank lines and the \"None\" placeholder\n",
    "            tools.append(tool)\n",
    "\n",
    "# Counter keeps first-seen order; OrderedDict preserves the original variable type.\n",
    "tools_counter = OrderedDict(Counter(tools))\n",
    "print(\"List of tools used in all samples:\")\n",
    "print(\"Total number of tools used:\", len(tools_counter))\n",
    "for tool, count in tools_counter.items():\n",
    "    print(f\" ├── {tool}: {count}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf-cert",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}