{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1b0c87d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json \n",
    "with open('metadata.jsonl', 'r') as f: \n",
    "    json_list = list(f)\n",
    "\n",
    "json_QA = []\n",
    "for json_str in json_list: \n",
    "    json_data = json.loads(json_str)\n",
    "    json_QA.append(json_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "06ef3470",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================================================\n",
      "Task ID: 305ac316-eef6-4446-960a-92d80d542f82\n",
      "Question: Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.\n",
      "Level: 1\n",
      "Final Answer: Wojciech\n",
      "Annotator Metadata: \n",
      "  ├── Steps: \n",
      "  │      ├── 1. Search \"Polish-language version of Everybody Loves Raymond\" and pull up the Wiki page for Wszyscy kochają Romana.\n",
      "  │      ├── 2. See that Bartłomiej Kasprzykowski is marked as playing Ray and go to his Wiki page.\n",
      "  │      ├── 3. See that he is stated to have played Wojciech Płaska in Magda M.\n",
      "  ├── Number of steps: 3\n",
      "  ├── How long did this take?: 5 minutes\n",
      "  ├── Tools:\n",
      "  │      ├── None\n",
      "  └── Number of tools: 0\n",
      "==================================================\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "random_samples = random.sample(json_QA, 1)\n",
    "for sample in random_samples:\n",
    "    print(\"=\" * 50)\n",
    "    print(f\"Task ID: {sample['task_id']}\")\n",
    "    print(f\"Question: {sample['Question']}\")\n",
    "    print(f\"Level: {sample['Level']}\")\n",
    "    print(f\"Final Answer: {sample['Final answer']}\")\n",
    "    print(f\"Annotator Metadata: \")\n",
    "    print(f\"  ├── Steps: \")\n",
    "    for step in sample['Annotator Metadata']['Steps'].split('\\n'):\n",
    "        print(f\"  │      ├── {step}\")\n",
    "    print(f\"  ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}\")\n",
    "    print(f\"  ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}\")\n",
    "    print(f\"  ├── Tools:\")\n",
    "    for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
    "        print(f\"  │      ├── {tool}\")\n",
    "    print(f\"  └── Number of tools: {sample['Annotator Metadata']['Number of tools']}\")\n",
    "print(\"=\" * 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "468f83d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import SupabaseVectorStore\n",
    "from supabase.client import Client, create_client\n",
    "\n",
    "\n",
    "load_dotenv()\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\") #  dim=768\n",
    "\n",
    "supabase_url = os.environ.get(\"SUPABASE_URL\")\n",
    "supabase_key = os.environ.get(\"SUPABASE_SERVICE_KEY\")\n",
    "supabase: Client = create_client(supabase_url, supabase_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c76e4a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.schema import Document\n",
    "docs = []\n",
    "cnt = 0 \n",
    "for sample in json_QA:\n",
    "    content = f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n",
    "    doc = {\n",
    "        \"id\" : cnt,\n",
    "        \"content\" : content,\n",
    "        \"metadata\" : {\n",
    "            \"source\" : sample['task_id']\n",
    "        },\n",
    "        \"embedding\" : embeddings.embed_query(content),\n",
    "    }\n",
    "    docs.append(doc)\n",
    "    cnt += 1\n",
    "\n",
    "#print(f\"sample document: {docs[0]}\")\n",
    "# upload the documents to the vector database\n",
    "try:\n",
    "    response = (\n",
    "        supabase.table(\"documents\")\n",
    "        .insert(docs)\n",
    "        .execute()\n",
    "    )\n",
    "except Exception as exception:\n",
    "    print(\"Error inserting data into Supabase:\", exception)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1d6fa354",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add items to vector database\n",
    "vector_store = SupabaseVectorStore(\n",
    "    client=supabase,\n",
    "    embedding= embeddings,\n",
    "    table_name=\"documents\",\n",
    "    query_name=\"match_documents_langchain\",\n",
    ")\n",
    "retriever = vector_store.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "aed648be",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
    "#matched_docs = vector_store.similarity_search(query, k=2)\n",
    "docs = retriever.invoke(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "a70e6e83",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d'}, page_content='Question : On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\\n\\nFinal answer : 80GSFC21M0002')"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87c3bf86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of the tools used in all the samples\n",
    "from collections import Counter, OrderedDict\n",
    "\n",
    "tools = []\n",
    "for sample in json_QA:\n",
    "    for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
    "        tool = tool[2:].strip().lower()\n",
    "        if tool.startswith(\"(\"):\n",
    "            tool = tool[11:].strip()\n",
    "        tools.append(tool)\n",
    "tools_counter = OrderedDict(Counter(tools))\n",
    "print(\"List of tools used in all samples:\")\n",
    "print(\"Total number of tools used:\", len(tools_counter))\n",
    "for tool, count in tools_counter.items():\n",
    "    print(f\"  ├── {tool}: {count}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf-cert",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}