{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3X2X_7DoyGdH"
      },
      "source": [
        "# Create and run a local RAG pipeline from scratch\n",
        "\n",
        "The goal of this notebook is to build a RAG (Retrieval-Augmented Generation) pipeline from scratch.\n",
        "\n",
        "Specifically, we'd like to be able to open a PDF file, ask questions (queries) about its contents, and have them answered by a Large Language Model (LLM)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3YE8fl97yGdO"
      },
      "source": [
        "## Requirements and setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "PhpZmm-3yGdO",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "50ba27d4-bab7-4798-976a-272cdd0ff2d0"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[INFO] Running in Google Colab, installing requirements.\n",
            "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.4.0+cu121)\n",
            "Collecting torch\n",
            "  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.16.0)\n",
            "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.2)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.3)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2024.6.1)\n",
            "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)\n",
            "  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
            "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)\n",
            "  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
            "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)\n",
            "  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
            "Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)\n",
            "  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
            "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)\n",
            "  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
            "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)\n",
            "  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
            "Collecting nvidia-curand-cu12==10.3.2.106 (from torch)\n",
            "  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
            "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch)\n",
            "  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
            "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch)\n",
            "  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
            "Collecting nvidia-nccl-cu12==2.20.5 (from torch)\n",
            "  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n",
            "Collecting nvidia-nvtx-cu12==12.1.105 (from torch)\n",
            "  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n",
            "Collecting triton==3.0.0 (from torch)\n",
            "  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)\n",
            "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch)\n",
            "  Downloading nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.5)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n",
            "Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl (797.1 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m797.1/797.1 MB\u001b[0m \u001b[31m842.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m64.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m37.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.2/176.2 MB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.4/209.4 MB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_x86_64.whl (19.7 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.7/19.7 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: triton, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch\n",
            "  Attempting uninstall: nvidia-nccl-cu12\n",
            "    Found existing installation: nvidia-nccl-cu12 2.23.4\n",
            "    Uninstalling nvidia-nccl-cu12-2.23.4:\n",
            "      Successfully uninstalled nvidia-nccl-cu12-2.23.4\n",
            "  Attempting uninstall: torch\n",
            "    Found existing installation: torch 2.4.0+cu121\n",
            "    Uninstalling torch-2.4.0+cu121:\n",
            "      Successfully uninstalled torch-2.4.0+cu121\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "torchaudio 2.4.0+cu121 requires torch==2.4.0, but you have torch 2.4.1 which is incompatible.\n",
            "torchvision 0.19.0+cu121 requires torch==2.4.0, but you have torch 2.4.1 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0mSuccessfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.6.68 nvidia-nvtx-cu12-12.1.105 torch-2.4.1 triton-3.0.0\n",
            "Collecting PyMuPDF\n",
            "  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
            "Collecting PyMuPDFb==1.24.10 (from PyMuPDF)\n",
            "  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)\n",
            "Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m102.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: PyMuPDFb, PyMuPDF\n",
            "Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.66.5)\n",
            "Collecting cohere\n",
            "  Downloading cohere-5.9.2-py3-none-any.whl.metadata (3.4 kB)\n",
            "Collecting pinecone-client\n",
            "  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)\n",
            "Collecting boto3<2.0.0,>=1.34.0 (from cohere)\n",
            "  Downloading boto3-1.35.20-py3-none-any.whl.metadata (6.6 kB)\n",
            "Collecting fastavro<2.0.0,>=1.9.4 (from cohere)\n",
            "  Downloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)\n",
            "Collecting httpx>=0.21.2 (from cohere)\n",
            "  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n",
            "Collecting httpx-sse==0.4.0 (from cohere)\n",
            "  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n",
            "Collecting parameterized<0.10.0,>=0.9.0 (from cohere)\n",
            "  Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)\n",
            "Requirement already satisfied: pydantic>=1.9.2 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.9.1)\n",
            "Requirement already satisfied: pydantic-core<3.0.0,>=2.18.2 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.23.3)\n",
            "Requirement already satisfied: requests<3.0.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.32.3)\n",
            "Requirement already satisfied: tokenizers<1,>=0.15 in /usr/local/lib/python3.10/dist-packages (from cohere) (0.19.1)\n",
            "Collecting types-requests<3.0.0,>=2.0.0 (from cohere)\n",
            "  Downloading types_requests-2.32.0.20240914-py3-none-any.whl.metadata (1.9 kB)\n",
            "Requirement already satisfied: typing_extensions>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (4.12.2)\n",
            "Requirement already satisfied: certifi>=2019.11.17 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2024.8.30)\n",
            "Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)\n",
            "  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)\n",
            "Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)\n",
            "  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)\n",
            "Requirement already satisfied: tqdm>=4.64.1 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (4.66.5)\n",
            "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2.0.7)\n",
            "Collecting botocore<1.36.0,>=1.35.20 (from boto3<2.0.0,>=1.34.0->cohere)\n",
            "  Downloading botocore-1.35.20-py3-none-any.whl.metadata (5.7 kB)\n",
            "Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.34.0->cohere)\n",
            "  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n",
            "Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.34.0->cohere)\n",
            "  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)\n",
            "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.21.2->cohere) (3.7.1)\n",
            "Collecting httpcore==1.* (from httpx>=0.21.2->cohere)\n",
            "  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)\n",
            "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx>=0.21.2->cohere) (3.8)\n",
            "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.21.2->cohere) (1.3.1)\n",
            "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.21.2->cohere)\n",
            "  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n",
            "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.9.2->cohere) (0.7.0)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->cohere) (3.3.2)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from tokenizers<1,>=0.15->cohere) (0.24.7)\n",
            "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.10/dist-packages (from botocore<1.36.0,>=1.35.20->boto3<2.0.0,>=1.34.0->cohere) (2.8.2)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (3.16.0)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (2024.6.1)\n",
            "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (24.1)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (6.0.2)\n",
            "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx>=0.21.2->cohere) (1.2.2)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.36.0,>=1.35.20->boto3<2.0.0,>=1.34.0->cohere) (1.16.0)\n",
            "Downloading cohere-5.9.2-py3-none-any.whl (222 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.4/222.4 kB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n",
            "Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.8/244.8 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading boto3-1.35.20-py3-none-any.whl (139 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m97.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading parameterized-0.9.0-py2.py3-none-any.whl (20 kB)\n",
            "Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.4/85.4 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)\n",
            "Downloading types_requests-2.32.0.20240914-py3-none-any.whl (15 kB)\n",
            "Downloading botocore-1.35.20-py3-none-any.whl (12.5 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m104.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
            "Downloading s3transfer-0.10.2-py3-none-any.whl (82 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: types-requests, pinecone-plugin-interface, parameterized, jmespath, httpx-sse, h11, fastavro, pinecone-plugin-inference, httpcore, botocore, s3transfer, pinecone-client, httpx, boto3, cohere\n",
            "Successfully installed boto3-1.35.20 botocore-1.35.20 cohere-5.9.2 fastavro-1.9.7 h11-0.14.0 httpcore-1.0.5 httpx-0.27.2 httpx-sse-0.4.0 jmespath-1.0.1 parameterized-0.9.0 pinecone-client-5.0.1 pinecone-plugin-inference-1.1.0 pinecone-plugin-interface-0.0.7 s3transfer-0.10.2 types-requests-2.32.0.20240914\n",
            "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.34.2)\n",
            "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.26.4)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (24.1)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0.2)\n",
            "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.4.1)\n",
            "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.24.7)\n",
            "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.4.5)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (3.16.0)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2024.6.1)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\n",
            "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.5)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.13.2)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.3)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.4)\n",
            "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (9.1.0.70)\n",
            "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.3.1)\n",
            "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (11.0.2.54)\n",
            "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (10.3.2.106)\n",
            "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (11.4.5.107)\n",
            "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.0.106)\n",
            "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.20.5)\n",
            "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
            "Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.0.0)\n",
            "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.10.0->accelerate) (12.6.68)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.8)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.8.30)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n",
            "Collecting bitsandbytes\n",
            "  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)\n",
            "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (2.4.1)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (1.26.4)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.16.0)\n",
            "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (4.12.2)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (1.13.2)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.3)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.1.4)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (2024.6.1)\n",
            "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (9.1.0.70)\n",
            "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.3.1)\n",
            "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (11.0.2.54)\n",
            "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (10.3.2.106)\n",
            "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (11.4.5.107)\n",
            "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.0.106)\n",
            "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (2.20.5)\n",
            "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
            "Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.0.0)\n",
            "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->bitsandbytes) (12.6.68)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->bitsandbytes) (2.1.5)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->bitsandbytes) (1.3.0)\n",
            "Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.5/137.5 MB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: bitsandbytes\n",
            "Successfully installed bitsandbytes-0.43.3\n",
            "Collecting flash-attn\n",
            "  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from flash-attn) (2.4.1)\n",
            "Requirement already satisfied: einops in /usr/local/lib/python3.10/dist-packages (from flash-attn) (0.8.0)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.16.0)\n",
            "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (4.12.2)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (1.13.2)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.3)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.1.4)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2024.6.1)\n",
            "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
            "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (9.1.0.70)\n",
            "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.3.1)\n",
            "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (11.0.2.54)\n",
            "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (10.3.2.106)\n",
            "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (11.4.5.107)\n",
            "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.0.106)\n",
            "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2.20.5)\n",
            "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
            "Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.0.0)\n",
            "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->flash-attn) (12.6.68)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->flash-attn) (2.1.5)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->flash-attn) (1.3.0)\n",
            "Building wheels for collected packages: flash-attn\n",
            "  Building wheel for flash-attn (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24\n",
            "  Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826\n",
            "Successfully built flash-attn\n",
            "Installing collected packages: flash-attn\n",
            "Successfully installed flash-attn-2.6.3\n"
          ]
        }
      ],
      "source": [
        "# Perform Google Colab installs (if running in Google Colab)\n",
        "import os\n",
        "\n",
        "if \"COLAB_GPU\" in os.environ:\n",
        "    print(\"[INFO] Running in Google Colab, installing requirements.\")\n",
        "    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)\n",
        "    !pip install PyMuPDF # for reading PDFs with Python\n",
        "    !pip install tqdm # for progress bars\n",
        "    !pip install cohere pinecone-client # for embedding models and vector database\n",
        "    !pip install accelerate # for quantization model loading\n",
        "    !pip install bitsandbytes # for quantizing models (less storage space)\n",
        "    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DWlEFjkeyGdP"
      },
      "source": [
        "## 1. Document/Text Processing and Embedding Creation\n",
        "\n",
        "Ingredients:\n",
        "* PDF document of choice.\n",
        "* Embedding model Cohere.\n",
        "\n",
        "Steps:\n",
        "1. Import PDF document.\n",
        "2. Process text for embedding (e.g. split into chunks of sentences).\n",
        "3. Embed text chunks with embedding model.\n",
        "4. Save embeddings to vector database Pinecone."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "V4yEvLvRyGdP"
      },
      "source": [
        "### Import PDF Document\n",
        "\n",
        "We're going to work on the open-source PDF textbook [*Human Nutrition: 2020 Edition*](https://pressbooks.oer.hawaii.edu/humannutrition2/).\n",
        "\n",
        "There are several libraries to open PDFs with Python but I found that [PyMuPDF](https://github.com/pymupdf/pymupdf) works quite well.\n",
        "\n",
        "First we'll download the PDF if it doesn't exist."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "-ULZl6gVyGdQ",
        "outputId": "4b65c581-be7f-4b71-8b27-a791676e041a",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "File doesn't exist, downloading...\n",
            "The file has been downloaded and saved as human-nutrition-text.pdf\n"
          ]
        }
      ],
      "source": [
        "# Download PDF file\n",
        "import os\n",
        "import requests\n",
        "\n",
        "# Get PDF document\n",
        "pdf_path = \"human-nutrition-text.pdf\"\n",
        "\n",
        "# Download PDF if it doesn't already exist\n",
        "if not os.path.exists(pdf_path):\n",
        "  print(\"File doesn't exist, downloading...\")\n",
        "\n",
        "  # The URL of the PDF you want to download\n",
        "  url = \"https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf\"\n",
        "\n",
        "  # The local filename to save the downloaded file\n",
        "  filename = pdf_path\n",
        "\n",
        "  # Send a GET request to the URL\n",
        "  response = requests.get(url)\n",
        "\n",
        "  # Check if the request was successful\n",
        "  if response.status_code == 200:\n",
        "      # Open a file in binary write mode and save the content to it\n",
        "      with open(filename, \"wb\") as file:\n",
        "          file.write(response.content)\n",
        "      print(f\"The file has been downloaded and saved as {filename}\")\n",
        "  else:\n",
        "      print(f\"Failed to download the file. Status code: {response.status_code}\")\n",
        "else:\n",
        "  print(f\"File {pdf_path} exists.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1jNab_TuyGdQ"
      },
      "source": [
        "PDF acquired!\n",
        "\n",
        "We can import the pages of our PDF to text by first defining the PDF path and then opening and reading it with PyMuPDF (`import fitz`).\n",
        "\n",
        "We'll write a small helper function to preprocess the text as it gets read.\n",
        "\n",
        "We'll save each page to a dictionary and then append that dictionary to a list for ease of use later."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "texuMJJKyGdQ",
        "outputId": "2dd5f8f9-e07f-4fac-a910-b282636c9cf1",
        "colab": {
          "referenced_widgets": [
            "662d657066044012ba174d1a7a993aa7",
            "b123cfb995154c5982344f77c7d97118",
            "28e55837d06840efbe1a70b768d165b2",
            "5fe003b2e9f841b8b1f34bdcce4569bd",
            "2bc69c558d664d5194582deb344b3ae6",
            "474dfac0c13d49e0a603f73369dcc42e",
            "0d2acff2bdd54e8b9fddfaf685144ae7",
            "cbf9ece884a24a7db323757c719a5ae8",
            "924c014aca3741aa808b705aa66864ed",
            "b72b7ebaaa0744699ba2e577a4911b59",
            "9a3b5eddeed443aba5908ad2f4e3b588"
          ],
          "base_uri": "https://localhost:8080/",
          "height": 257
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "0it [00:00, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "662d657066044012ba174d1a7a993aa7"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'page_number': -42,\n",
              "  'page_char_count': 29,\n",
              "  'page_word_count': 4,\n",
              "  'page_sentence_count_raw': 1,\n",
              "  'page_token_count': 7.25,\n",
              "  'text': 'Human Nutrition: 2020 Edition'},\n",
              " {'page_number': -41,\n",
              "  'page_char_count': 0,\n",
              "  'page_word_count': 1,\n",
              "  'page_sentence_count_raw': 1,\n",
              "  'page_token_count': 0.0,\n",
              "  'text': ''}]"
            ]
          },
          "metadata": {},
          "execution_count": 3
        }
      ],
      "source": [
        "import fitz  # PyMuPDF\n",
        "from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm\n",
        "\n",
        "def text_formatter(text: str) -> str:\n",
        "    \"\"\"Performs minor formatting on text.\"\"\"\n",
        "    cleaned_text = text.replace(\"\\n\", \" \").strip() # this might be different for each doc\n",
        "\n",
        "    # Other potential text formatting functions can go here\n",
        "    return cleaned_text\n",
        "\n",
        "# Open PDF and get lines/pages\n",
        "# this only focuses on text\n",
        "def open_and_read_pdf(pdf_path: str, page_offset: int = 0) -> list[dict]:\n",
        "    \"\"\"\n",
        "    Opens a PDF file, reads its text content page by page, and collects statistics.\n",
        "\n",
        "    Parameters:\n",
        "        pdf_path (str): The file path to the PDF document to be opened and read.\n",
        "\n",
        "    Returns:\n",
        "        list[dict]: A list of dictionaries, each containing the page number\n",
        "        (adjusted), character count, word count, sentence count, token count, and the extracted text\n",
        "        for each page.\n",
        "    \"\"\"\n",
        "    doc = fitz.open(pdf_path)  # open a document\n",
        "    pages_and_texts = []\n",
        "    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages\n",
        "        text = page.get_text()  # get plain text encoded as UTF-8\n",
        "        text = text_formatter(text)\n",
        "        pages_and_texts.append({\"page_number\": page_number - page_offset,  # adjust page numbers since our PDF starts on page 42\n",
        "                                \"page_char_count\": len(text),\n",
        "                                \"page_word_count\": len(text.split(\" \")),\n",
        "                                \"page_sentence_count_raw\": len(text.split(\". \")),\n",
        "                                \"page_token_count\": len(text) / 4,  # 1 token = ~4 chars\n",
        "                                \"text\": text})\n",
        "    return pages_and_texts\n",
        "\n",
        "pages_and_texts = open_and_read_pdf(pdf_path=pdf_path, page_offset=42)\n",
        "pages_and_texts[:2]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "drqYJA2cyGdR"
      },
      "source": [
        "Now let's get a random sample of the pages."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "HTvdS7WryGdR",
        "outputId": "70be460a-0317-4f1c-99cd-8e6fe995bea5",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'page_number': 591,\n",
              "  'page_char_count': 313,\n",
              "  'page_word_count': 52,\n",
              "  'page_sentence_count_raw': 3,\n",
              "  'page_token_count': 78.25,\n",
              "  'text': 'recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \\xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=347  \\xa0 592  |  Water-Soluble Vitamins'},\n",
              " {'page_number': 1126,\n",
              "  'page_char_count': 1533,\n",
              "  'page_word_count': 218,\n",
              "  'page_sentence_count_raw': 25,\n",
              "  'page_token_count': 383.25,\n",
              "  'text': 'an incessant fear of weight gain but instead have an obsession with  “feeling pure, healthy and natural.”7 People affected by orthorexia  nervosa tend to follow diets tied to a philosophy or theory and  believe that their theory of eating is the best.8 9 Such diets often  have a redemptive quality that involves denying oneself of “bad” or  “wrong” foods.10 In extreme cases, affected individuals may also fear  contamination or harm from water and electricity leading them to  use filters to purify their environment from electrical emissions.  7.\\xa0Mathieu J. (2005). What is orthorexia? Journal of the  American Dietetic Association,\\xa0105(10), 1510-1512.  Bratman, S. Health Food Junkie. Yoga Journal. 1997,\\xa0 September/October, 42-50. Available at  https://www.orthorexia.com/original-orthorexia- essay/.  8.\\xa0Donini LM, Marsili D, Graziani MP, Imbriale M, Cannella  C. (2004). Orthorexia nervosa: a preliminary study with a  proposal for diagnosis and an attempt to measure the  dimension of the phenomenon. Eating and Weight  Disorders,\\xa09(2), 151‐157.  9.\\xa0Orthorexia. (2017, February 26). National Eating  Disorders Association.  https://www.nationaleatingdisorders.org/learn/by- eating-disorder/other/orthorexia  10.\\xa0Mathieu J. (2005). What is orthorexia? Journal of the  American Dietetic Association,\\xa0105(10), 1510-1512.  Bratman, S. Health Food Junkie. Yoga Journal. 1997,\\xa0 September/October, 42-50. Available at  https://www.orthorexia.com/original-orthorexia- essay/.  Undernutrition, Overnutrition, and Malnutrition  |  1127'},\n",
              " {'page_number': 895,\n",
              "  'page_char_count': 1535,\n",
              "  'page_word_count': 250,\n",
              "  'page_sentence_count_raw': 13,\n",
              "  'page_token_count': 383.75,\n",
              "  'text': 'There are a number of reasons behind this problem, including:  • larger portion sizes  • limited access to nutrient-rich foods  • increased access to fast foods and vending machines  • lack of breastfeeding support  • declining physical education programs in schools  • insufficient physical activity and a sedentary lifestyle  • media messages encouraging the consumption of unhealthy  foods  Children who suffer from obesity are more likely to become  overweight or obese adults. Obesity has a profound effect on self- esteem, energy, and activity level. Even more importantly, it is a  major risk factor for a number of diseases later in life, including  cardiovascular disease, Type 2 diabetes, stroke, hypertension, and  certain cancers.6  A percentile for body mass index (BMI) specific to age and sex is  used to determine if a child is overweight or obese. This is more  appropriate than the BMI categories used for adults because the  body composition of children varies as they develop, and differs  between boys and girls. If a child gains weight inappropriate to  growth, parents and caregivers should limit energy-dense,  nutrient-poor snack foods. In addition, it is extremely beneficial  to increase a child’s physical activity and limit sedentary activities,  facts/epidemic-childhood-obesity. Accessed December  5, 2017.  6.\\xa0Obesity and Overweight Fact Sheet. World Health  Organization. http://www.who.int/mediacentre/ factsheets/fs311/en/. Updated October 2017. Accessed  November 29, 2017.  896  |  Late Adolescence'}]"
            ]
          },
          "metadata": {},
          "execution_count": 4
        }
      ],
      "source": [
        "import random\n",
        "\n",
        "random.sample(pages_and_texts, k=3)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UmXFEWhRyGdR"
      },
      "source": [
        "### Get some stats on the text\n",
        "\n",
        "Let's perform a rough exploratory data analysis (EDA) to get an idea of the size of the texts (e.g. character counts, word counts etc) we're working with.\n",
        "\n",
        "The different sizes of texts will be a good indicator into how we should split our texts.\n",
        "\n",
        "For now, let's turn our list of dictionaries into a DataFrame and explore it."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "id": "2dhR7orByGdR",
        "outputId": "bee274c1-3c66-44a1-a7fe-eb81578a9105",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   page_number  page_char_count  page_word_count  page_sentence_count_raw  \\\n",
              "0          -42               29                4                        1   \n",
              "1          -41                0                1                        1   \n",
              "2          -40              320               54                        1   \n",
              "3          -39              212               32                        1   \n",
              "4          -38              797              145                        2   \n",
              "\n",
              "   page_token_count                                               text  \n",
              "0              7.25                      Human Nutrition: 2020 Edition  \n",
              "1              0.00                                                     \n",
              "2             80.00  Human Nutrition: 2020  Edition  UNIVERSITY OF ...  \n",
              "3             53.00  Human Nutrition: 2020 Edition by University of...  \n",
              "4            199.25  Contents  Preface  University of Hawai‘i at Mā...  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-9bb49ed3-bda0-4390-8a1c-1c02bdbc768e\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>page_number</th>\n",
              "      <th>page_char_count</th>\n",
              "      <th>page_word_count</th>\n",
              "      <th>page_sentence_count_raw</th>\n",
              "      <th>page_token_count</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-42</td>\n",
              "      <td>29</td>\n",
              "      <td>4</td>\n",
              "      <td>1</td>\n",
              "      <td>7.25</td>\n",
              "      <td>Human Nutrition: 2020 Edition</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>-41</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0.00</td>\n",
              "      <td></td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>-40</td>\n",
              "      <td>320</td>\n",
              "      <td>54</td>\n",
              "      <td>1</td>\n",
              "      <td>80.00</td>\n",
              "      <td>Human Nutrition: 2020  Edition  UNIVERSITY OF ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>-39</td>\n",
              "      <td>212</td>\n",
              "      <td>32</td>\n",
              "      <td>1</td>\n",
              "      <td>53.00</td>\n",
              "      <td>Human Nutrition: 2020 Edition by University of...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>-38</td>\n",
              "      <td>797</td>\n",
              "      <td>145</td>\n",
              "      <td>2</td>\n",
              "      <td>199.25</td>\n",
              "      <td>Contents  Preface  University of Hawai‘i at Mā...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9bb49ed3-bda0-4390-8a1c-1c02bdbc768e')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-9bb49ed3-bda0-4390-8a1c-1c02bdbc768e button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-9bb49ed3-bda0-4390-8a1c-1c02bdbc768e');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-d7164b1d-c91a-4a8e-911b-37f9944b8c2b\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-d7164b1d-c91a-4a8e-911b-37f9944b8c2b')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-d7164b1d-c91a-4a8e-911b-37f9944b8c2b button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 1208,\n  \"fields\": [\n    {\n      \"column\": \"page_number\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 348,\n        \"min\": -42,\n        \"max\": 1165,\n        \"num_unique_values\": 1208,\n        \"samples\": [\n          59,\n          712,\n          266\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_char_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 560,\n        \"min\": 0,\n        \"max\": 2308,\n        \"num_unique_values\": 883,\n        \"samples\": [\n          1742,\n          1077,\n          1003\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_word_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 95,\n        \"min\": 1,\n        \"max\": 429,\n        \"num_unique_values\": 349,\n        \"samples\": [\n          352,\n          78,\n          88\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_sentence_count_raw\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 6,\n        \"min\": 1,\n        \"max\": 32,\n        \"num_unique_values\": 30,\n        \"samples\": [\n          28,\n          24,\n          22\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_token_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 140.09556874248847,\n        \"min\": 0.0,\n        \"max\": 577.0,\n        \"num_unique_values\": 883,\n        \"samples\": [\n          435.5,\n          269.25,\n          250.75\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      
\"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 1180,\n        \"samples\": [\n          \"Electrolytes Important for  Fluid Balance  UNIVERSITY OF HAWAI\\u2018I AT M\\u0100NOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Cells are about 75 percent water and blood plasma is about 95  percent water. Why then, does the water not flow from blood  plasma to cells? The force of water also known as hydrostatic  pressure  maintains  the  volumes  of  water  between  fluid  compartments against the force of all dissolved substances. The  concentration is the amount of particles in a set volume of water.  (Recall that individual solutes can differ in concentration between  the intracellular and extracellular fluids, but the total concentration  of all dissolved substances is equal.)  The force driving the water movement through the selectively  permeable membrane is the higher solute concentration on the  one side. Solutes at different concentrations on either side of a  selectively permeable membrane exert a force, called osmotic  pressure. The higher concentration of solutes on one side compared  to the other of the U-tube exerts osmotic pressure, pulling the  water to a higher volume on the side of the U-tube containing  more dissolved particles. When the osmotic pressure is equal to the  pressure of the water on the selectively permeable membrane, net  water movement stops (though it still diffuses back and forth at an  equal rate).  One equation exemplifying equal concentrations but different  volumes is the following  5 grams of glucose in 1 liter = 10 grams of glucose in 2 liters (5g/L =  5g/L)  The differences in concentrations of particular substances  provide concentration gradients that cells can use to perform work.  
A concentration gradient is a form of potential energy, like water  172  |  Electrolytes Important for Fluid Balance\",\n          \"Units of Measure  UNIVERSITY OF HAWAI\\u2018I AT M\\u0100NOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  In  nutrition,  there  are  two  systems  of  commonly  used  measurements: Metric and US Customary. We need both because  the US won\\u2019t adopt the metric system completely.  The Metric and US Customary System  These are commonly used prefixes for the Metric System:  Micro- (\\u03bc) 1/1,000,000th (one millionth)  Milli- (m)  1/1000th (one thousandth)  Centi- (c)  1/100th (one hundredth)  Deci- (d)  1/10th (one tenth)  Kilo- (k)  1000x (one thousand times)  Mass  Metric System  US Customary System Conversions  Microgram (\\u03bcg) Ounce (oz)  1 oz = 28.35 g  Milligram (mg)  Pound (lb)  1 lb = 16 oz  Gram (g)  1 lb = 454 g  Kilogram (kg)  1 kg = 2.2 lbs  18  |  Units of Measure\",\n          \"activity level. For example, dental problems can lead to difficulties  with chewing and swallowing, which in turn can make it hard to  maintain a healthy diet. The use of dentures or the preparation of  pureed or chopped foods can help solve this problem. There also  is a decreased thirst response in the elderly, and the kidneys have  a decreased ability to concentrate urine, both of which can lead to  dehydration.  Sensory Issues  At about age sixty, taste buds begin to decrease in size and number.  As a result, the taste threshold is higher in older adults, meaning  that more of the same flavor must be present to detect the taste.  Many elderly people lose the ability to distinguish between salty,  sour, sweet, and bitter flavors. This can make food seem less  appealing and decrease the appetite. An intake of foods high in  sugar and sodium can increase due to an inability to discern those  tastes. The sense of smell also decreases, which impacts attitudes  toward food. 
Sensory issues may also affect the digestion because  the taste and smell of food stimulates the secretion of  digestive\\u00a0enzymes in the mouth, stomach, and pancreas.  Dysphagia  Some older adults have difficulty getting adequate nutrition  because of the disorder dysphagia, which impairs the ability to  swallow. Any damage to the parts of the brain that control  swallowing can result in dysphagia, therefore stroke is a common  cause. Dysphagia is also associated with advanced dementia  because of overall brain function impairment. To assist older adults  suffering from dysphagia, it can be helpful to alter food consistency.  Older Adulthood: The Golden Years  |  923\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 5
        }
      ],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# Build a DataFrame from the list of per-page dicts so the page-level stats are easy to inspect\n",
        "df = pd.DataFrame(pages_and_texts)\n",
        "df.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "id": "_2Mg-5k_yGdR",
        "outputId": "bb5fbcc0-329e-4bec-baf6-26252f6e0be9",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 300
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "       page_number  page_char_count  page_word_count  page_sentence_count_raw  \\\n",
              "count      1208.00          1208.00          1208.00                  1208.00   \n",
              "mean        561.50          1148.00           198.30                     9.97   \n",
              "std         348.86           560.38            95.76                     6.19   \n",
              "min         -42.00             0.00             1.00                     1.00   \n",
              "25%         259.75           762.00           134.00                     4.00   \n",
              "50%         561.50          1231.50           214.50                    10.00   \n",
              "75%         863.25          1603.50           271.00                    14.00   \n",
              "max        1165.00          2308.00           429.00                    32.00   \n",
              "\n",
              "       page_token_count  \n",
              "count           1208.00  \n",
              "mean             287.00  \n",
              "std              140.10  \n",
              "min                0.00  \n",
              "25%              190.50  \n",
              "50%              307.88  \n",
              "75%              400.88  \n",
              "max              577.00  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-609ec24f-e72b-424e-a9be-ca4929462465\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>page_number</th>\n",
              "      <th>page_char_count</th>\n",
              "      <th>page_word_count</th>\n",
              "      <th>page_sentence_count_raw</th>\n",
              "      <th>page_token_count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>561.50</td>\n",
              "      <td>1148.00</td>\n",
              "      <td>198.30</td>\n",
              "      <td>9.97</td>\n",
              "      <td>287.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>348.86</td>\n",
              "      <td>560.38</td>\n",
              "      <td>95.76</td>\n",
              "      <td>6.19</td>\n",
              "      <td>140.10</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>-42.00</td>\n",
              "      <td>0.00</td>\n",
              "      <td>1.00</td>\n",
              "      <td>1.00</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>259.75</td>\n",
              "      <td>762.00</td>\n",
              "      <td>134.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>190.50</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>561.50</td>\n",
              "      <td>1231.50</td>\n",
              "      <td>214.50</td>\n",
              "      <td>10.00</td>\n",
              "      <td>307.88</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>863.25</td>\n",
              "      <td>1603.50</td>\n",
              "      <td>271.00</td>\n",
              "      <td>14.00</td>\n",
              "      <td>400.88</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>1165.00</td>\n",
              "      <td>2308.00</td>\n",
              "      <td>429.00</td>\n",
              "      <td>32.00</td>\n",
              "      <td>577.00</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-609ec24f-e72b-424e-a9be-ca4929462465')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-609ec24f-e72b-424e-a9be-ca4929462465 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-609ec24f-e72b-424e-a9be-ca4929462465');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-777091aa-4243-437a-aa9f-b6dee5f6c838\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-777091aa-4243-437a-aa9f-b6dee5f6c838')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-777091aa-4243-437a-aa9f-b6dee5f6c838 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"page_number\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 439.1842002346103,\n        \"min\": -42.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 7,\n        \"samples\": [\n          1208.0,\n          561.5,\n          863.25\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_char_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 692.7598126695862,\n        \"min\": 0.0,\n        \"max\": 2308.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          1148.0,\n          1231.5,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_word_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 380.86345761027945,\n        \"min\": 1.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          198.3,\n          214.5,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_sentence_count_raw\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 423.3006680160771,\n        \"min\": 1.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          9.97,\n          10.0,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_token_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 373.38318073681745,\n        \"min\": 0.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          287.0,\n          307.88,\n          1208.0\n        ],\n   
     \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 6
        }
      ],
      "source": [
        "# Get summary stats (count, mean, std, min, quartiles, max) for the numeric columns, rounded to 2 decimal places\n",
        "df.describe().round(2)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jMNQgx4MyGdS"
      },
      "source": [
        "Okay, looks like our average token count per page is 287.\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VpMTeek-yGdS"
      },
      "source": [
        "### Further text processing (splitting pages into sentences)\n",
        "\n",
        "Ideally, we process the text (e.g. split it into smaller pieces) before embedding it.\n",
        "\n",
        "A simple method is to break the text into chunks of sentences.\n",
        "\n",
        "As in, chunk a page of text into groups of 5, 7, 10 or more sentences.\n",
        "\n",
        "We will follow the workflow of:\n",
        "\n",
        "`Ingest text -> split it into groups/chunks -> embed the groups/chunks -> use the embeddings`\n",
        "\n",
        "Some options for splitting text into sentences:\n",
        "\n",
        "1. Split into sentences with simple rules (e.g. split on \". \" with `text = text.split(\". \")`, like we did above).\n",
        "2. Split into sentences with a natural language processing (NLP) library such as [spaCy](https://spacy.io/) or [nltk](https://www.nltk.org/).\n",
        "\n",
        "Why split into sentences?\n",
        "\n",
        "* Easier to handle than larger pages of text (especially if pages are densely filled with text).\n",
        "* We can get specific and find out which group of sentences was used to help within a RAG pipeline.\n",
        "\n",
        "> **Resource:** See [spaCy install instructions](https://spacy.io/usage).\n",
        "\n",
        "Let's use spaCy to break our text into sentences since it's likely a bit more robust than just using `text.split(\". \")`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "id": "-yz9rHMEyGdS",
        "outputId": "1105c552-197b-497f-c914-ab684bee7ba4",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[This is a sentence., This another sentence.]"
            ]
          },
          "metadata": {},
          "execution_count": 7
        }
      ],
      "source": [
        "from spacy.lang.en import English  # English language model\n",
        "\n",
        "# Create a blank English pipeline\n",
        "nlp = English()\n",
        "\n",
        "# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/\n",
        "# (the sentencizer is the component that marks the sentence boundaries read through `doc.sents` below)\n",
        "nlp.add_pipe(\"sentencizer\")\n",
        "\n",
        "# Create a document instance as an example\n",
        "doc = nlp(\"This is a sentence. This another sentence.\")\n",
        "# Sanity check: the example text should be split into exactly two sentences\n",
        "assert len(list(doc.sents)) == 2\n",
        "\n",
        "# Access the sentences of the document\n",
        "list(doc.sents)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Kzn8irI5yGdT"
      },
      "source": [
        "So let's run our small sentencizing pipeline on our pages of text."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "id": "rzj7cThayGdT",
        "outputId": "fd0ee782-8189-4a2b-b985-2196539e0609",
        "colab": {
          "referenced_widgets": [
            "94cc52bff24e4c158e2859195a142498",
            "a069ef0cde334591a5dc9cc3f2a5aacf",
            "02e443fff37f4cceab1996ea4893dd3c",
            "d76e01440712420eb7249bf8e6fc1aaa",
            "f5809653a4a94e8f80452f1703dc47a5",
            "6792e77105e54799bdd471eec04c7c5d",
            "a84fdc3489ad4e78892906e5330e21f2",
            "80941748295241d2b712292647b3c623",
            "91bc9829243545008533e7a21bb44d8f",
            "a2adca3f8a484ab48b859922f93f380d",
            "86b0843d4fc4438fb312d40ad9447dcc"
          ],
          "base_uri": "https://localhost:8080/",
          "height": 49
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "  0%|          | 0/1208 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "94cc52bff24e4c158e2859195a142498"
            }
          },
          "metadata": {}
        }
      ],
      "source": [
        "for item in tqdm(pages_and_texts):\n",
        "    # Sentencize the page text and keep every sentence as a plain string\n",
        "    # (`doc.sents` yields spaCy objects rather than `str`)\n",
        "    item[\"sentences\"] = [str(sentence) for sentence in nlp(item[\"text\"]).sents]\n",
        "\n",
        "    # Record how many sentences spaCy found on this page\n",
        "    item[\"page_sentence_count_spacy\"] = len(item[\"sentences\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "id": "XSuwY1o4yGdT",
        "outputId": "1fb87ef9-830e-4963-fde8-e3d8703f17d5",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'page_number': 578,\n",
              "  'page_char_count': 1284,\n",
              "  'page_word_count': 216,\n",
              "  'page_sentence_count_raw': 8,\n",
              "  'page_token_count': 321.0,\n",
              "  'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Folate is especially essential for the growth and specialization of  cells of the central nervous system. Children whose mothers were  folate-deficient during pregnancy have a higher risk of neural-tube  birth defects. Folate deficiency is causally linked to the development  of spina bifida, a neural-tube defect that occurs when the spine  does not completely enclose the spinal cord. Spina bifida can lead  to many physical and mental disabilities (Figure 9.18\\xa0“Spina Bifida in  Infants” ). Observational studies show that the prevalence of neural- tube defects was decreased after the fortification of enriched cereal  grain products with folate in 1996 in the United States (and 1998  in Canada) compared to before grain products were fortified with  folate.  Additionally, results of clinical trials have demonstrated that  neural-tube defects are significantly decreased in the offspring of  mothers who began taking folate supplements one month prior  to becoming pregnant and throughout the pregnancy. In response  to the scientific evidence, the Food and Nutrition Board of the  Institute of Medicine (IOM) raised the RDA for folate to 600  micrograms per day for pregnant women. Some were concerned  Water-Soluble Vitamins  |  579',\n",
              "  'sentences': ['Image by  Allison  Calabrese /  CC BY 4.0  Folate is especially essential for the growth and specialization of  cells of the central nervous system.',\n",
              "   'Children whose mothers were  folate-deficient during pregnancy have a higher risk of neural-tube  birth defects.',\n",
              "   'Folate deficiency is causally linked to the development  of spina bifida, a neural-tube defect that occurs when the spine  does not completely enclose the spinal cord.',\n",
              "   'Spina bifida can lead  to many physical and mental disabilities (Figure 9.18\\xa0“Spina Bifida in  Infants” ).',\n",
              "   'Observational studies show that the prevalence of neural- tube defects was decreased after the fortification of enriched cereal  grain products with folate in 1996 in the United States (and 1998  in Canada) compared to before grain products were fortified with  folate.',\n",
              "   ' Additionally, results of clinical trials have demonstrated that  neural-tube defects are significantly decreased in the offspring of  mothers who began taking folate supplements one month prior  to becoming pregnant and throughout the pregnancy.',\n",
              "   'In response  to the scientific evidence, the Food and Nutrition Board of the  Institute of Medicine (IOM) raised the RDA for folate to 600  micrograms per day for pregnant women.',\n",
              "   'Some were concerned  Water-Soluble Vitamins  |  579'],\n",
              "  'page_sentence_count_spacy': 8}]"
            ]
          },
          "metadata": {},
          "execution_count": 9
        }
      ],
      "source": [
        "# Inspect an example\n",
        "random.sample(pages_and_texts, k=1)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ci4M4fZ2yGdU"
      },
      "source": [
        "Wonderful!\n",
        "\n",
        "Now let's turn out list of dictionaries into a DataFrame and get some stats."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "c8uMnhkqyGdU",
        "outputId": "17753e0b-924b-4cf4-dff7-b6eae15909ab",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 300
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "       page_number  page_char_count  page_word_count  page_sentence_count_raw  \\\n",
              "count      1208.00          1208.00          1208.00                  1208.00   \n",
              "mean        561.50          1148.00           198.30                     9.97   \n",
              "std         348.86           560.38            95.76                     6.19   \n",
              "min         -42.00             0.00             1.00                     1.00   \n",
              "25%         259.75           762.00           134.00                     4.00   \n",
              "50%         561.50          1231.50           214.50                    10.00   \n",
              "75%         863.25          1603.50           271.00                    14.00   \n",
              "max        1165.00          2308.00           429.00                    32.00   \n",
              "\n",
              "       page_token_count  page_sentence_count_spacy  \n",
              "count           1208.00                    1208.00  \n",
              "mean             287.00                      10.32  \n",
              "std              140.10                       6.30  \n",
              "min                0.00                       0.00  \n",
              "25%              190.50                       5.00  \n",
              "50%              307.88                      10.00  \n",
              "75%              400.88                      15.00  \n",
              "max              577.00                      28.00  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-722932dc-d682-479f-b802-3678000aec8a\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>page_number</th>\n",
              "      <th>page_char_count</th>\n",
              "      <th>page_word_count</th>\n",
              "      <th>page_sentence_count_raw</th>\n",
              "      <th>page_token_count</th>\n",
              "      <th>page_sentence_count_spacy</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>561.50</td>\n",
              "      <td>1148.00</td>\n",
              "      <td>198.30</td>\n",
              "      <td>9.97</td>\n",
              "      <td>287.00</td>\n",
              "      <td>10.32</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>348.86</td>\n",
              "      <td>560.38</td>\n",
              "      <td>95.76</td>\n",
              "      <td>6.19</td>\n",
              "      <td>140.10</td>\n",
              "      <td>6.30</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>-42.00</td>\n",
              "      <td>0.00</td>\n",
              "      <td>1.00</td>\n",
              "      <td>1.00</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>259.75</td>\n",
              "      <td>762.00</td>\n",
              "      <td>134.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>190.50</td>\n",
              "      <td>5.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>561.50</td>\n",
              "      <td>1231.50</td>\n",
              "      <td>214.50</td>\n",
              "      <td>10.00</td>\n",
              "      <td>307.88</td>\n",
              "      <td>10.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>863.25</td>\n",
              "      <td>1603.50</td>\n",
              "      <td>271.00</td>\n",
              "      <td>14.00</td>\n",
              "      <td>400.88</td>\n",
              "      <td>15.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>1165.00</td>\n",
              "      <td>2308.00</td>\n",
              "      <td>429.00</td>\n",
              "      <td>32.00</td>\n",
              "      <td>577.00</td>\n",
              "      <td>28.00</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-722932dc-d682-479f-b802-3678000aec8a')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-722932dc-d682-479f-b802-3678000aec8a button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-722932dc-d682-479f-b802-3678000aec8a');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-edaf6b8e-42f0-49c8-80ce-a64c2fc2d62c\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-edaf6b8e-42f0-49c8-80ce-a64c2fc2d62c')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-edaf6b8e-42f0-49c8-80ce-a64c2fc2d62c button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"page_number\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 439.1842002346103,\n        \"min\": -42.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 7,\n        \"samples\": [\n          1208.0,\n          561.5,\n          863.25\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_char_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 692.7598126695862,\n        \"min\": 0.0,\n        \"max\": 2308.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          1148.0,\n          1231.5,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_word_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 380.86345761027945,\n        \"min\": 1.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          198.3,\n          214.5,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_sentence_count_raw\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 423.3006680160771,\n        \"min\": 1.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          9.97,\n          10.0,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_token_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 373.38318073681745,\n        \"min\": 0.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          287.0,\n          307.88,\n          1208.0\n        ],\n   
     \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_sentence_count_spacy\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 423.405400861363,\n        \"min\": 0.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          10.32,\n          10.0,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 10
        }
      ],
      "source": [
        "df = pd.DataFrame(pages_and_texts)\n",
        "df.describe().round(2)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4lZJbailyGdU"
      },
      "source": [
        "For our set of text, it looks like our raw sentence count (e.g. splitting on `\". \"`) is quite close to what spaCy came up with.\n",
        "\n",
        "Now we've got our text split into sentences, lets group those sentences!"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wAFfalXJyGdU"
      },
      "source": [
        "### Chunking our sentences together\n",
        "\n",
        "Let's take a step to break down our list of sentences/text into smaller chunks.\n",
        "\n",
        "This process is referred to as **chunking**.\n",
        "\n",
        "Why do we do this?\n",
        "\n",
        "1. Easier to manage similar sized chunks of text.\n",
        "2. Don't overload the embedding models capacity for tokens (e.g. if an embedding model has a capacity of 384 tokens, there could be information loss if we try to embed a sequence of 400+ tokens).\n",
        "3. Our LLM context window (the amount of tokens an LLM can take in) may be limited and requires compute power so we want to make sure we're using it as well as possible.\n",
        "\n",
        "There are many different ways emerging for creating chunks of information/text.\n",
        "\n",
        "For now, we're going to keep it simple and break our pages of sentences into groups of 10 (this number is arbitrary and can be changed).\n",
        "\n",
        "On average each of our pages has 10 sentences.\n",
        "\n",
        "And an average total of 287 tokens per page.\n",
        "\n",
        "So our groups of 10 sentences will also be ~287 tokens long.\n",
        "\n",
        "To split our groups of sentences into chunks of 10 or less, let's create a function which accepts a list as input and recursively breaks into down into sublists of a specified size."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "lgx5Dy_ayGdU",
        "outputId": "0c2cfa69-c02e-4f22-930a-1ca34df52705",
        "colab": {
          "referenced_widgets": [
            "1016a3c98b6448b7a11852633587ab3f",
            "74b707712fce45cd98a16b71908916db",
            "494fc75e07d14ca794c9b152e100953e",
            "20c69b545a6d4bbb86344abfea24da58",
            "8ffe48cbd52245ca849696746b9dec74",
            "711f9bc8e4fb4e5c90aced0384b4cb54",
            "67e2aafea7754c23906a8b35e9bd2f08",
            "2e86c19b91f2419e9eb181e105dfb3b0",
            "5a03c35e76ca47c5a973023980862c76",
            "58766a63fc7f4188b41ac32b386769f4",
            "1ce1ca10b5b0434aa17417b63db75ab4"
          ],
          "base_uri": "https://localhost:8080/",
          "height": 49
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "  0%|          | 0/1208 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "1016a3c98b6448b7a11852633587ab3f"
            }
          },
          "metadata": {}
        }
      ],
      "source": [
        "# Define split size to turn groups of sentences into chunks\n",
        "num_sentence_chunk_size = 10\n",
        "\n",
        "# Create a function that recursively splits a list into desired sizes\n",
        "def split_list(input_list: list,\n",
        "               slice_size: int) -> list[list[str]]:\n",
        "    \"\"\"\n",
        "    Splits the input_list into sublists of size slice_size (or as close as possible).\n",
        "\n",
        "    For example, a list of 17 sentences would be split into two lists of [[10], [7]]\n",
        "    \"\"\"\n",
        "    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]\n",
        "\n",
        "# Loop through pages and texts and split sentences into chunks\n",
        "for item in tqdm(pages_and_texts):\n",
        "    item[\"sentence_chunks\"] = split_list(input_list=item[\"sentences\"],\n",
        "                                         slice_size=num_sentence_chunk_size)\n",
        "    item[\"num_chunks\"] = len(item[\"sentence_chunks\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "id": "DU0Z43XayGdU",
        "outputId": "eb12fd92-90c1-4141-efa0-87a3be6a8b80",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'page_number': 684,\n",
              "  'page_char_count': 297,\n",
              "  'page_word_count': 51,\n",
              "  'page_sentence_count_raw': 3,\n",
              "  'page_token_count': 74.25,\n",
              "  'text': 'recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \\xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=393  \\xa0 Iodine  |  685',\n",
              "  'sentences': ['recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',\n",
              "   ' \\xa0 An interactive or media element has been  excluded from this version of the text.',\n",
              "   'You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=393  \\xa0 Iodine  |  685'],\n",
              "  'page_sentence_count_spacy': 3,\n",
              "  'sentence_chunks': [['recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',\n",
              "    ' \\xa0 An interactive or media element has been  excluded from this version of the text.',\n",
              "    'You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=393  \\xa0 Iodine  |  685']],\n",
              "  'num_chunks': 1}]"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)\n",
        "random.sample(pages_and_texts, k=1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "yOlOwJj2yGdU",
        "outputId": "b24e5521-38c8-436d-857c-fdc044a938f0",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 300
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "       page_number  page_char_count  page_word_count  page_sentence_count_raw  \\\n",
              "count      1208.00          1208.00          1208.00                  1208.00   \n",
              "mean        561.50          1148.00           198.30                     9.97   \n",
              "std         348.86           560.38            95.76                     6.19   \n",
              "min         -42.00             0.00             1.00                     1.00   \n",
              "25%         259.75           762.00           134.00                     4.00   \n",
              "50%         561.50          1231.50           214.50                    10.00   \n",
              "75%         863.25          1603.50           271.00                    14.00   \n",
              "max        1165.00          2308.00           429.00                    32.00   \n",
              "\n",
              "       page_token_count  page_sentence_count_spacy  num_chunks  \n",
              "count           1208.00                    1208.00     1208.00  \n",
              "mean             287.00                      10.32        1.53  \n",
              "std              140.10                       6.30        0.64  \n",
              "min                0.00                       0.00        0.00  \n",
              "25%              190.50                       5.00        1.00  \n",
              "50%              307.88                      10.00        1.00  \n",
              "75%              400.88                      15.00        2.00  \n",
              "max              577.00                      28.00        3.00  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-f3b3b2ed-ba92-4cf5-b25b-a570edbae90f\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>page_number</th>\n",
              "      <th>page_char_count</th>\n",
              "      <th>page_word_count</th>\n",
              "      <th>page_sentence_count_raw</th>\n",
              "      <th>page_token_count</th>\n",
              "      <th>page_sentence_count_spacy</th>\n",
              "      <th>num_chunks</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "      <td>1208.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>561.50</td>\n",
              "      <td>1148.00</td>\n",
              "      <td>198.30</td>\n",
              "      <td>9.97</td>\n",
              "      <td>287.00</td>\n",
              "      <td>10.32</td>\n",
              "      <td>1.53</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>348.86</td>\n",
              "      <td>560.38</td>\n",
              "      <td>95.76</td>\n",
              "      <td>6.19</td>\n",
              "      <td>140.10</td>\n",
              "      <td>6.30</td>\n",
              "      <td>0.64</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>-42.00</td>\n",
              "      <td>0.00</td>\n",
              "      <td>1.00</td>\n",
              "      <td>1.00</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>259.75</td>\n",
              "      <td>762.00</td>\n",
              "      <td>134.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>190.50</td>\n",
              "      <td>5.00</td>\n",
              "      <td>1.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>561.50</td>\n",
              "      <td>1231.50</td>\n",
              "      <td>214.50</td>\n",
              "      <td>10.00</td>\n",
              "      <td>307.88</td>\n",
              "      <td>10.00</td>\n",
              "      <td>1.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>863.25</td>\n",
              "      <td>1603.50</td>\n",
              "      <td>271.00</td>\n",
              "      <td>14.00</td>\n",
              "      <td>400.88</td>\n",
              "      <td>15.00</td>\n",
              "      <td>2.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>1165.00</td>\n",
              "      <td>2308.00</td>\n",
              "      <td>429.00</td>\n",
              "      <td>32.00</td>\n",
              "      <td>577.00</td>\n",
              "      <td>28.00</td>\n",
              "      <td>3.00</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f3b3b2ed-ba92-4cf5-b25b-a570edbae90f')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-f3b3b2ed-ba92-4cf5-b25b-a570edbae90f button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-f3b3b2ed-ba92-4cf5-b25b-a570edbae90f');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-2bbcee31-5a5d-4345-be5f-fd7209fbe747\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-2bbcee31-5a5d-4345-be5f-fd7209fbe747')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-2bbcee31-5a5d-4345-be5f-fd7209fbe747 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"page_number\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 439.1842002346103,\n        \"min\": -42.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 7,\n        \"samples\": [\n          1208.0,\n          561.5,\n          863.25\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_char_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 692.7598126695862,\n        \"min\": 0.0,\n        \"max\": 2308.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          1148.0,\n          1231.5,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_word_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 380.86345761027945,\n        \"min\": 1.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          198.3,\n          214.5,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_sentence_count_raw\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 423.3006680160771,\n        \"min\": 1.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          9.97,\n          10.0,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_token_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 373.38318073681745,\n        \"min\": 0.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          287.0,\n          307.88,\n          1208.0\n        ],\n   
     \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"page_sentence_count_spacy\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 423.405400861363,\n        \"min\": 0.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          10.32,\n          10.0,\n          1208.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"num_chunks\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 426.6303015471525,\n        \"min\": 0.0,\n        \"max\": 1208.0,\n        \"num_unique_values\": 7,\n        \"samples\": [\n          1208.0,\n          1.53,\n          2.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 13
        }
      ],
      "source": [
        "# Create a DataFrame to get stats\n",
        "df = pd.DataFrame(pages_and_texts)\n",
        "df.describe().round(2)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eb9k-vsbyGdV"
      },
      "source": [
        "The average number of chunks is around 1.5, this is expected since many of our pages only contain an average of 10 sentences."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tgcgENBHyGdV"
      },
      "source": [
        "### Splitting each chunk into its own item\n",
        "\n",
        "We'd like to embed each chunk of sentences into its own numerical representation.\n",
        "\n",
        "Let's create a new list of dictionaries each containing a single chunk of sentences with relative information such as page number as well statistics about each chunk."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "UFSxO5iXyGdV",
        "outputId": "b7611c07-724f-4d86-8dd8-694533f56955",
        "colab": {
          "referenced_widgets": [
            "b19f39b1fd314d71885b15154debce11",
            "9f9948c49b5e4a5e87631d4137bdca3e",
            "4eaeda1e6d94473cb39ca9ad6204b607",
            "2cf3513015d243e7842f6120f939a706",
            "2f2f1c2e4df346cbb7b6775e0aa0c55a",
            "ebd0595644334e02abfbafd31264196e",
            "6ac351005e7247f391c684e620744259",
            "fe005c591c2448fb8f3f4f0bd5cc0bb1",
            "980ae4efbcb64f758e1311e4e2b0cad0",
            "14c9b952454c40b59325b63be4cc2d7e",
            "5a646e132031418d8cefea0a99f22444"
          ],
          "base_uri": "https://localhost:8080/",
          "height": 66
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "  0%|          | 0/1208 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "b19f39b1fd314d71885b15154debce11"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1843"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ],
      "source": [
        "import re\n",
        "\n",
        "# Split each chunk into its own item\n",
        "pages_and_chunks = []\n",
        "for item in tqdm(pages_and_texts):\n",
        "    for sentence_chunk in item[\"sentence_chunks\"]:\n",
        "        chunk_dict = {}\n",
        "        chunk_dict[\"page_number\"] = item[\"page_number\"]\n",
        "\n",
        "        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)\n",
        "        joined_sentence_chunk = \"\".join(sentence_chunk).replace(\"  \", \" \").strip()\n",
        "        joined_sentence_chunk = re.sub(r'\\.([A-Z])', r'. \\1', joined_sentence_chunk) # \".A\" -> \". A\" for any full-stop/capital letter combo\n",
        "        chunk_dict[\"sentence_chunk\"] = joined_sentence_chunk\n",
        "\n",
        "        # Get stats about the chunk\n",
        "        chunk_dict[\"chunk_char_count\"] = len(joined_sentence_chunk)\n",
        "        chunk_dict[\"chunk_word_count\"] = len([word for word in joined_sentence_chunk.split(\" \")])\n",
        "        chunk_dict[\"chunk_token_count\"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters\n",
        "\n",
        "        pages_and_chunks.append(chunk_dict)\n",
        "\n",
        "# How many chunks do we have?\n",
        "len(pages_and_chunks)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "id": "9XZYLeXWyGdV",
        "outputId": "0b21ba26-2e7e-4a4e-b22c-383414eb1937",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'page_number': 803,\n",
              "  'sentence_chunk': 'and eclampsia, which is sometimes referred to as toxemia of pregnancy. This disorder is marked by elevated blood pressure and protein in the urine and is associated with swelling. To prevent preeclampsia, the WHO recommends increasing calcium intake for women consuming diets low in that micronutrient, administering a low dosage of aspirin (75 milligrams), and increasing prenatal checkups. The WHO does not recommend the restriction of dietary salt intake during pregnancy with the aim of preventing the development of pre-eclampsia and its complications12. About 4 percent of pregnant women suffer from a condition known as gestational diabetes, which is abnormal glucose tolerance during pregnancy. The body becomes resistant to the hormone insulin, which enables cells to transport glucose from the blood. Gestational diabetes is usually diagnosed around twenty-four to twenty-six weeks, although it is possible for the condition to develop later into a pregnancy. Signs and symptoms of this disease include extreme hunger, thirst, or fatigue. If blood sugar levels are not properly monitored and treated, the baby might gain too much weight and require a cesarean delivery. Diet and regular physical activity can help to manage this condition.',\n",
              "  'chunk_char_count': 1249,\n",
              "  'chunk_word_count': 189,\n",
              "  'chunk_token_count': 312.25}]"
            ]
          },
          "metadata": {},
          "execution_count": 15
        }
      ],
      "source": [
        "# View a random sample\n",
        "random.sample(pages_and_chunks, k=1)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ME1zoPDryGdV"
      },
      "source": [
        "Excellent!\n",
        "\n",
        "Now we've broken our whole textbook into chunks of 10 sentences or less as well as the page number they came from.\n",
        "\n",
        "This means we could reference a chunk of text and know its source.\n",
        "\n",
        "Let's get some stats about our chunks."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "id": "lzeLIECnyGdV",
        "outputId": "b923a384-39be-4435-9bad-735064d37d54",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 300
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "       page_number  chunk_char_count  chunk_word_count  chunk_token_count\n",
              "count      1843.00           1843.00           1843.00            1843.00\n",
              "mean        582.38            734.44            112.33             183.61\n",
              "std         347.79            447.54             71.22             111.89\n",
              "min         -42.00             12.00              3.00               3.00\n",
              "25%         279.50            315.00             44.00              78.75\n",
              "50%         585.00            746.00            114.00             186.50\n",
              "75%         889.00           1118.50            173.00             279.62\n",
              "max        1165.00           1831.00            297.00             457.75"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-5acfb31d-f24f-45a7-a23b-e5162690b593\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>page_number</th>\n",
              "      <th>chunk_char_count</th>\n",
              "      <th>chunk_word_count</th>\n",
              "      <th>chunk_token_count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>1843.00</td>\n",
              "      <td>1843.00</td>\n",
              "      <td>1843.00</td>\n",
              "      <td>1843.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>582.38</td>\n",
              "      <td>734.44</td>\n",
              "      <td>112.33</td>\n",
              "      <td>183.61</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>347.79</td>\n",
              "      <td>447.54</td>\n",
              "      <td>71.22</td>\n",
              "      <td>111.89</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>-42.00</td>\n",
              "      <td>12.00</td>\n",
              "      <td>3.00</td>\n",
              "      <td>3.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>279.50</td>\n",
              "      <td>315.00</td>\n",
              "      <td>44.00</td>\n",
              "      <td>78.75</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>585.00</td>\n",
              "      <td>746.00</td>\n",
              "      <td>114.00</td>\n",
              "      <td>186.50</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>889.00</td>\n",
              "      <td>1118.50</td>\n",
              "      <td>173.00</td>\n",
              "      <td>279.62</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>1165.00</td>\n",
              "      <td>1831.00</td>\n",
              "      <td>297.00</td>\n",
              "      <td>457.75</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5acfb31d-f24f-45a7-a23b-e5162690b593')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-5acfb31d-f24f-45a7-a23b-e5162690b593 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-5acfb31d-f24f-45a7-a23b-e5162690b593');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-296be9c1-d17b-4226-87dc-8291cf2c2ab5\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-296be9c1-d17b-4226-87dc-8291cf2c2ab5')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-296be9c1-d17b-4226-87dc-8291cf2c2ab5 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"page_number\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 589.9857395234833,\n        \"min\": -42.0,\n        \"max\": 1843.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          582.38,\n          585.0,\n          1843.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"chunk_char_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 674.7972839940261,\n        \"min\": 12.0,\n        \"max\": 1843.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          734.44,\n          746.0,\n          1843.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"chunk_word_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 616.9738147936844,\n        \"min\": 3.0,\n        \"max\": 1843.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          112.33,\n          114.0,\n          1843.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"chunk_token_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 601.8910558517096,\n        \"min\": 3.0,\n        \"max\": 1843.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          183.61,\n          186.5,\n          1843.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 16
        }
      ],
      "source": [
        "# Get stats about our chunks\n",
        "df = pd.DataFrame(pages_and_chunks)\n",
        "df.describe().round(2)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Z5TKYZ7hyGdV"
      },
      "source": [
        "Looks like some of our chunks have quite a low token count.\n",
        "\n",
        "Let's check for samples with less than 30 tokens (about the length of a sentence) and see if they are worth keeping?"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "id": "mCw52c7myGdY",
        "outputId": "56bd6e25-eb30-4185-a93c-120930f90edc",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Chunk token count: 24.75 | Text: http://www.ajcn.org/content/87/1/64.long. Accessed September 22, 2017. 554 | Water-Soluble Vitamins\n",
            "Chunk token count: 26.5 | Text: It is stored in the rectum until it is expelled through the anus via defecation. The Digestive System | 77\n",
            "Chunk token count: 11.0 | Text: Accessed October 5, 2017. Introduction | 433\n",
            "Chunk token count: 19.25 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=519   Introduction | 991\n",
            "Chunk token count: 17.75 | Text: Table 6.1 Essential and Nonessential Amino Acids Defining Protein | 365\n"
          ]
        }
      ],
      "source": [
        "# Show random chunks with under 30 tokens in length\n",
        "min_token_length = 30\n",
        "for row in df[df[\"chunk_token_count\"] <= min_token_length].sample(5).iterrows():\n",
        "    print(f'Chunk token count: {row[1][\"chunk_token_count\"]} | Text: {row[1][\"sentence_chunk\"]}')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5N_hTluhyGdY"
      },
      "source": [
        "Looks like many of these are headers and footers of different pages.\n",
        "\n",
        "They don't seem to offer too much information.\n",
        "\n",
        "Let's filter our DataFrame/list of dictionaries to only include chunks with over 30 tokens in length."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "id": "-6d8UvjryGdZ",
        "outputId": "54d6477c-1746-40e3-876b-5e8e8fa6988d",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'page_number': -40,\n",
              "  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',\n",
              "  'chunk_char_count': 308,\n",
              "  'chunk_word_count': 42,\n",
              "  'chunk_token_count': 77.0},\n",
              " {'page_number': -39,\n",
              "  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',\n",
              "  'chunk_char_count': 210,\n",
              "  'chunk_word_count': 30,\n",
              "  'chunk_token_count': 52.5}]"
            ]
          },
          "metadata": {},
          "execution_count": 18
        }
      ],
      "source": [
        "pages_and_chunks_over_min_token_len = df[df[\"chunk_token_count\"] > min_token_length].to_dict(orient=\"records\")\n",
        "pages_and_chunks_over_min_token_len[:2]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0PdbYyR5yGdZ"
      },
      "source": [
        "Smaller chunks filtered!\n",
        "\n",
        "Time to embed our chunks of text!"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GlljGim2yGdZ"
      },
      "source": [
        "### Embedding our text chunks\n",
        "\n",
        "While humans understand text, machines understand numbers best.\n",
        "\n",
        "An [embedding](https://vickiboykis.com/what_are_embeddings/index.html) is a broad concept.\n",
        "\n",
        "A simple definitions is \"a useful numerical representation\".\n",
        "\n",
        "The most powerful thing about modern embeddings is that they are *learned* representations.\n",
        "\n",
        "Meaning rather than directly mapping words/tokens/characters to numbers directly (e.g. `{\"a\": 0, \"b\": 1, \"c\": 3...}`), the numerical representation of tokens is learned by going through large corpuses of text and figuring out how different tokens relate to each other.\n",
        "\n",
        "Our goal is to turn each of our chunks into a numerical representation (an embedding vector, where a vector is a sequence of numbers arranged in order).\n",
        "\n",
        "\n",
        "To do so, we'll use the [Cohere](https://cohere.com/embed) embedding model.\n",
        "\n",
        "Specifically, we'll get the `embed-english-v2.0` model (you can see the model's intended use on the [Model](https://docs.cohere.com/reference/embed)).\n",
        "\n",
        "Upload these vector embeddings into [Pinecone](https://docs.pinecone.io/guides/get-started/quickstart)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "id": "v3XzdfNXyGda"
      },
      "outputs": [],
      "source": [
        "# Turn text chunks into a single list\n",
        "text_chunks = [item[\"sentence_chunk\"] for item in pages_and_chunks_over_min_token_len]"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "COHERE_KEY = 'mMJW7g9UDwQCFhtW905hK854aQJPoU13cRevsrvg'"
      ],
      "metadata": {
        "id": "K17nl6ERMbUM"
      },
      "execution_count": 20,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### Create Embeddings"
      ],
      "metadata": {
        "id": "BUjgmxH3o_2v"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import cohere\n",
        "\n",
        "co = cohere.Client(COHERE_KEY)"
      ],
      "metadata": {
        "id": "4Wuu7-biMwXw"
      },
      "execution_count": 21,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "\n",
        "# Embed all texts\n",
        "embeds = co.embed(\n",
        "    texts=text_chunks,\n",
        "    model='embed-english-v2.0',\n",
        "    input_type='search_query',\n",
        "    truncate='END'\n",
        ").embeddings"
      ],
      "metadata": {
        "id": "psNUNLrnM7HG",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "aa4272c6-5f34-415c-a581-9fcc3da44bce"
      },
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "CPU times: user 46.6 s, sys: 1.16 s, total: 47.8 s\n",
            "Wall time: 52.9 s\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Check the dimensionality of the returned vectors. We will need to save the embedding dimensionality from this to be used when initializing your Pinecone index later"
      ],
      "metadata": {
        "id": "1j_vH4J7pdjP"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "shape = np.array(embeds).shape\n",
        "shape"
      ],
      "metadata": {
        "id": "KaYqJ8XnN4IN",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "3cc1e958-535b-40bd-aca6-bec61a0b5c1b"
      },
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(1680, 4096)"
            ]
          },
          "metadata": {},
          "execution_count": 23
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We can see the 4096 embedding dimensionality produced by Cohere’s `embed-english-v2.0` model, and the 1680 samples we built embeddings for."
      ],
      "metadata": {
        "id": "2oy7c9CmptEZ"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### Store the embeddings"
      ],
      "metadata": {
        "id": "XUUyEyOJpNT2"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now that we have our embeddings, we can move on to indexing them in the Pinecone vector database."
      ],
      "metadata": {
        "id": "mZU5ruxlp8kA"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "We first initialize our connection to Pinecone and then create a new index called cohere-pinecone for storing the embeddings. When creating the index, we specify that we would like to use the cosine similarity metric to align with Cohere’s embeddings, and also pass the embedding dimensionality of 4096."
      ],
      "metadata": {
        "id": "NyqtOoawqHx4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from pinecone import Pinecone, ServerlessSpec\n",
        "import os\n",
        "\n",
        "# Use the API key to initialize the Pinecone client\n",
        "pc = Pinecone(api_key='bdb5ea29-449c-4e3d-8075-a0898d1b8404')\n",
        "\n",
        "index_name = 'cohere-pinecone'\n",
        "\n",
        "# if the index does not exist, we create it\n",
        "if index_name not in pc.list_indexes().names():\n",
        "    pc.create_index(\n",
        "        name=index_name,\n",
        "        dimension=shape[1],\n",
        "        metric=\"cosine\",\n",
        "        spec=ServerlessSpec(\n",
        "            cloud='aws',\n",
        "            region='us-east-1'\n",
        "        )\n",
        "    )\n",
        "\n",
        "# connect to index\n",
        "index = pc.Index(index_name)"
      ],
      "metadata": {
        "id": "Ddena3FORyK8"
      },
      "execution_count": 24,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now we can begin populating the index with our embeddings. Pinecone expects us to provide a list of tuples in the format (*id, vector, metadata*), where the metadata field is an optional extra field where we can store anything we want in a dictionary format. For this example, we will store the original text of the embeddings."
      ],
      "metadata": {
        "id": "bE51nIMaqfcb"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "While uploading our data, we will batch everything to avoid pushing too much data in one go."
      ],
      "metadata": {
        "id": "8iOr_78fqto0"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "batch_size = 128\n",
        "\n",
        "ids = [str(i) for i in range(shape[0])]\n",
        "# create list of metadata dictionaries\n",
        "meta = [{'text': text} for text in text_chunks]\n",
        "\n",
        "# create list of (id, vector, metadata) tuples to be upserted\n",
        "to_upsert = list(zip(ids, embeds, meta))\n",
        "\n",
        "for i in range(0, shape[0], batch_size):\n",
        "    i_end = min(i+batch_size, shape[0])\n",
        "    index.upsert(vectors=to_upsert[i:i_end])\n",
        "\n",
        "# let's view the index statistics\n",
        "index.describe_index_stats()"
      ],
      "metadata": {
        "id": "2nJPhPSfSkLD",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "01a010bf-59a5-49d6-b0d2-e6789380e6ef"
      },
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'dimension': 4096,\n",
              " 'index_fullness': 0.0,\n",
              " 'namespaces': {'': {'vector_count': 1680}},\n",
              " 'total_vector_count': 1680}"
            ]
          },
          "metadata": {},
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We can see from `index.describe_index_stats()` that we have a 4096-dimensionality index populated with 1680 embeddings. Note that serverless indexes scale automatically as needed, so the index_fullness metric is relevant only for pod-based indexes."
      ],
      "metadata": {
        "id": "BJBAU1Bdqz90"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Semantic Search\n",
        "\n",
        "Now that we have our indexed vectors, we can perform a few search queries. When searching, we will first embed our query using Cohere, and then search using the returned vector in Pinecone."
      ],
      "metadata": {
        "id": "0_rRJN4-rIYZ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Functionising the semantic search\n",
        "import textwrap # for wrapping text\n",
        "\n",
        "def search_queries(queries: list[str], k: int = 1) -> dict:\n",
        "  \"\"\"\n",
        "  Function to embed multiple queries, search in Pinecone, and return the top-k results.\n",
        "\n",
        "  Args:\n",
        "  - queries (list): A list of query strings.\n",
        "  - k (int): The number of top results to retrieve for each query (default is 1).\n",
        "\n",
        "  Returns:\n",
        "  - results (dict): A dictionary where each query maps to its top-k results.\n",
        "  \"\"\"\n",
        "  # Step 1: Create embeddings for all queries\n",
        "  query_embeddings = co.embed(\n",
        "      texts=queries,\n",
        "      model='embed-english-v2.0',\n",
        "      input_type='search_query',\n",
        "      truncate='END'\n",
        "  ).embeddings\n",
        "\n",
        "  # Step 2: Perform Pinecone search for each query embedding\n",
        "  all_results = {}\n",
        "\n",
        "  for i, query_embedding in enumerate(query_embeddings):\n",
        "      # Query Pinecone index with each query embedding\n",
        "      res = index.query(vector=query_embedding, top_k=k, include_metadata=True)\n",
        "\n",
        "      # Store the result for each query (as a list of matches)\n",
        "      all_results[queries[i]] = res['matches']\n",
        "\n",
        "  # Step 3: Display results for each query\n",
        "  wrapper = textwrap.TextWrapper(width=80)\n",
        "  for query, matches in all_results.items():\n",
        "      print(f\"Results for Query: {query}\\n\")\n",
        "\n",
        "      # Iterate over the top-k matches\n",
        "      for match in matches:\n",
        "          score = match['score']\n",
        "          text = match['metadata']['text']\n",
        "\n",
        "          # Wrap the text to fit within 80 characters per line\n",
        "          wrapped_text = wrapper.fill(text=text)\n",
        "\n",
        "          # Print the score and corresponding text in a readable format\n",
        "          print(f\"Score: {score:.2f}\")\n",
        "          print(\"Document:\\n\")\n",
        "          print(f\"{wrapped_text}\\n{'-'*50}\\n\")  # Divider to separate results\n",
        "\n",
        "      # Larger divider between different queries\n",
        "      print(f\"\\n{'='*100}\\n\")\n"
      ],
      "metadata": {
        "id": "6zW58XIHvSU8"
      },
      "execution_count": 26,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Let's Look our Result"
      ],
      "metadata": {
        "id": "5z59tIj9sFdM"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "result = search_queries(queries=[\"macro nutrients\"],\n",
        "                        k=1)\n",
        "result"
      ],
      "metadata": {
        "id": "OkoiXD78YdE0",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "0d361fea-587f-43b6-af67-3155e0c9f12b"
      },
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results for Query: macro nutrients\n",
            "\n",
            "Score: 0.63\n",
            "Document:\n",
            "\n",
            "Macronutrients Nutrients that are needed in large amounts are called\n",
            "macronutrients. There are three classes of macronutrients: carbohydrates,\n",
            "lipids, and proteins. These can be metabolically processed into cellular energy.\n",
            "The energy from macronutrients comes from their chemical bonds. This chemical\n",
            "energy is converted into cellular energy that is then utilized to perform work,\n",
            "allowing our bodies to conduct their basic functions. A unit of measurement of\n",
            "food energy is the calorie. On nutrition food labels the amount given for\n",
            "“calories” is actually equivalent to each calorie multiplied by one thousand. A\n",
            "kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with\n",
            "the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a\n",
            "macronutrient in the sense that you require a large amount of it, but unlike the\n",
            "other macronutrients, it does not yield calories. Carbohydrates Carbohydrates\n",
            "are molecules composed of carbon, hydrogen, and oxygen.\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Several example queries"
      ],
      "metadata": {
        "id": "gwa0gATQ7eKE"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Multiple-queries"
      ],
      "metadata": {
        "id": "yXFSlz3E9Xp8"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "result = search_queries(queries=[\"what is carbohydrates\", \"what is fats\", \"What is Starch\"],\n",
        "                        k=1)\n",
        "result"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bURHakxd7rEV",
        "outputId": "2985986f-f657-41f0-a48c-2b44f65069c1"
      },
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results for Query: what is carbohydrates\n",
            "\n",
            "Score: 0.62\n",
            "Document:\n",
            "\n",
            "Carbohydrat es are broken down into the subgroups simple and complex\n",
            "carbohydrate s. These subgroups are further categorized into mono-, di-, and\n",
            "polysacchari des. indigestible carbohydrates provide a good amount of fiber with\n",
            "a host of other health benefits. Plants synthesize the fast-releasing\n",
            "carbohydrate, glucose, from carbon dioxide in the air and water, and by\n",
            "harnessing the sun’s energy. Recall that plants convert the energy in sunlight\n",
            "to chemical energy in the molecule, glucose. Plants use glucose to make other\n",
            "larger, more slow-releasing carbohydrates. When we eat plants we harvest the\n",
            "energy of glucose to support life’s processes. Figure 4.1 Carbohydrate\n",
            "Classification Scheme Carbohydrates are a group of organic compounds containing\n",
            "a ratio of one carbon atom to two hydrogen atoms to one oxygen atom. Basically,\n",
            "they are hydrated carbons. The word “carbo” means carbon and “hydrate” means\n",
            "water. Glucose, the most abundant carbohydrate in the human body, has six carbon\n",
            "atoms, twelve hydrogen atoms, and six oxygen atoms.\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: what is fats\n",
            "\n",
            "Score: 0.59\n",
            "Document:\n",
            "\n",
            "scarce. Our ability to store excess caloric energy as fat for future usage\n",
            "allowed us to continue as a species during these times of famine. So, normal fat\n",
            "reserves are a signal that metabolic processes are efficient and a person is\n",
            "healthy. Lipids are a family of organic compounds that are mostly insoluble in\n",
            "water. Composed of fats and oils, lipids are molecules that yield high energy\n",
            "and have a chemical composition mainly of carbon, hydrogen, and oxygen. Lipids\n",
            "perform three primary biological functions within the body: they serve as\n",
            "structural components of cell membranes, function as energy storehouses, and\n",
            "function as important signaling molecules. The three main types of lipids are\n",
            "triglycerides, phospholipids, and sterols. Triglycerides make up more than 95\n",
            "percent of lipids in the diet and are commonly found in fried foods, vegetable\n",
            "oil, butter, whole milk, cheese, cream cheese, and some meats. Naturally\n",
            "occurring triglycerides are found in many foods, including avocados, olives,\n",
            "corn, and nuts. We commonly call the triglycerides in our food “fats” and\n",
            "“oils.”\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: What is Starch\n",
            "\n",
            "Score: 0.53\n",
            "Document:\n",
            "\n",
            "Complex/Slow-Releasing Carbohydrates Complex carbohydrates are polysaccharides,\n",
            "long chains of monosaccharides that may be branched or not branched. There are\n",
            "two main groups of polysaccharides: starches and fibers. Starches Starch\n",
            "molecules are found in abundance in grains, legumes, and root vegetables, such\n",
            "as potatoes. Amylose, a plant starch, is a linear chain containing hundreds of\n",
            "glucose units. Amylopectin, another plant starch, is a branched chain containing\n",
            "thousands of glucose units. These large starch molecules form crystals and are\n",
            "the energy-storing molecules of plants. These two starch molecules (amylose and\n",
            "amylopectin) are contained together in foods, but the smaller one, amylose, is\n",
            "less abundant. Eating raw foods containing starches provides very little energy\n",
            "as the digestive system has a hard time breaking them down. Cooking breaks down\n",
            "the crystal structure of starches, making them much easier to break down in the\n",
            "human body. The starches that remain intact throughout digestion are called\n",
            "resistant starches.\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "result = search_queries(queries=[\"what are fat-soluble vitamins?\", \"What are the causes of type 2 diabetes?\"],\n",
        "                        k=1)\n",
        "result"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "i0VgFvI5787K",
        "outputId": "8bb51c7f-744e-43ec-9b6d-b6fd414400f7"
      },
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results for Query: what are fat-soluble vitamins?\n",
            "\n",
            "Score: 0.67\n",
            "Document:\n",
            "\n",
            "subcutaneous fat, or fat underneath the skin. This blanket layer of tissue\n",
            "insulates the body from extreme temperatures and helps keep the internal climate\n",
            "under control. It pads our hands and buttocks and prevents friction, as these\n",
            "areas frequently come in contact with hard surfaces. It also gives the body the\n",
            "extra padding required when engaging in physically demanding activities such as\n",
            "ice- or roller skating, horseback riding, or snowboarding. Aiding Digestion and\n",
            "Increasing Bioavailability The dietary fats in the foods we eat break down in\n",
            "our digestive systems and begin the transport of precious micronutrients. By\n",
            "carrying fat-soluble nutrients through the digestive process, intestinal\n",
            "absorption is improved. This improved absorption is also known as increased\n",
            "bioavailability. Fat-soluble nutrients are especially important for good health\n",
            "and exhibit a variety of functions. Vitamins A, D, E, and K—the fat-soluble\n",
            "vitamins—are mainly found in foods containing fat. Some fat-soluble vitamins\n",
            "(such as vitamin A) are also found in naturally fat-free foods such as green\n",
            "leafy vegetables, carrots, and broccoli.\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: What are the causes of type 2 diabetes?\n",
            "\n",
            "Score: 0.76\n",
            "Document:\n",
            "\n",
            "To see how the rise in obesity in this country is paralleled by the rise in Type\n",
            "2 diabetes, review this report by the CDC.\n",
            "https://www.cdc.gov/diabetes/statistics/slides/ maps_diabetesobesity_trends.pdf \n",
            "What is the causal relationship between overnutrition and Type 2 diabetes?The\n",
            "prevailing theory is that the overconsumption of high-fat and high-sugar foods\n",
            "causes changes in muscle, fat, and liver cells that leads to a diminished\n",
            "response from the pancreatic hormone insulin. These cells are called “insulin-\n",
            "resistant.”Insulin is released after a meal and instructs the liver and other\n",
            "tissues to take up glucose and fatty acids that are circulating in the blood.\n",
            "When cells are resistant to insulin they do not take up enough glucose and fatty\n",
            "acids, so glucose and fatty acids remain at high concentrations in the blood.\n",
            "The chronic elevation of glucose and fatty acids in the blood also causes damage\n",
            "to other tissues over time, so that people who have Type 2 diabetes are at\n",
            "increased risk for cardiovascular disease, kidney disease, nerve damage, and eye\n",
            "disease. The Endocrine System | 107\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "result = search_queries(queries=[\"What is the importance of hydration for physical performance?\", \"What role does fibre play in digestion?\", \"What is the RDA for protein per day?\"],\n",
        "                        k=1)\n",
        "result"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b6C1DkwI859s",
        "outputId": "6ea955e4-df86-4b9c-e396-a95316583fb0"
      },
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results for Query: What is the importance of hydration for physical performance?\n",
            "\n",
            "Score: 0.68\n",
            "Document:\n",
            "\n",
            "Image by Allison Calabrese / CC BY 4.0 Water and Electrolyte Needs UNIVERSITY OF\n",
            "HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION\n",
            "PROGRAM During exercise, being appropriately hydrated contributes to\n",
            "performance. Water is needed to cool the body, transport oxygen and nutrients,\n",
            "and remove waste products from the muscles. Water needs are increased during\n",
            "exercise due to the extra water losses through evaporation and sweat.\n",
            "Dehydration can occur when there is inadequate water levels in the body and can\n",
            "be very hazardous to the health of an individual. As the severity of dehydration\n",
            "increases, the exercise performance of an individual will begin to decline (see\n",
            "Figure 16.9 “Dehydration Effect on Exercise Performance”). It is important to\n",
            "continue to consume water before, during and after exercise to avoid dehydration\n",
            "as much as possible. Figure 16.9 Dehydration Effect on Exercise Performance\n",
            "During exercise, thirst is not a reliable short term indicator of the body’s\n",
            "needs as it typically is not enough to replace the water loss. Even with the\n",
            "constant replenishing of water throughout an 972 | Water and Electrolyte Needs\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: What role does fibre play in digestion?\n",
            "\n",
            "Score: 0.62\n",
            "Document:\n",
            "\n",
            "Image by Allison Calabrese / CC BY 4.0 fiber intake because of what the\n",
            "breakdown products of the fiber do for the colon. The bacterial breakdown of\n",
            "fiber in the large intestine releases short-chain fatty acids. These molecules\n",
            "have been found to nourish colonic cells, inhibit colonic inflammation, and\n",
            "stimulate the immune system (thereby providing protection of the colon from\n",
            "harmful substances). Additionally, the bacterial indigestible fiber, mostly\n",
            "insoluble, increases stool bulk and softness increasing transit time in the\n",
            "large intestine and facilitating feces elimination. One phenomenon of consuming\n",
            "foods high in fiber is increased gas, since the byproducts of bacterial\n",
            "digestion of fiber are gases. Figure 18.2 Diverticulitis: A Disease of Fiber\n",
            "Deficiency Some studies have found a link between high dietary-fiber intake and\n",
            "a decreased risk for colon cancer. However an analysis of 1086 | Nutrition,\n",
            "Health and Disease\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: What is the RDI for protein per day?\n",
            "\n",
            "Score: 0.66\n",
            "Document:\n",
            "\n",
            "Proteins, Diet, and Personal Choices UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE\n",
            "AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM We have discussed what\n",
            "proteins are, how they are made, how they are digested and absorbed, the many\n",
            "functions of proteins in the body, and the consequences of having too little or\n",
            "too much protein in the diet. This section will provide you with information on\n",
            "how to determine the recommended amount of protein for you, and your many\n",
            "choices in designing an optimal diet with high-quality protein sources. How Much\n",
            "Protein Does a Person Need in Their Diet? The recommendations set by the IOM for\n",
            "the Recommended Daily Allowance (RDA) and AMDR for protein for different age\n",
            "groups are listed in Table 6.2 “Dietary Reference Intakes for Protein”. A\n",
            "Tolerable Upper Intake Limit for protein has not been set, but it is recommended\n",
            "that you do not exceed the upper end of the AMDR. Table 6.2 Dietary Reference\n",
            "Intakes for Protein Proteins, Diet, and Personal Choices | 409\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "result = search_queries(queries=[\"what are other health benefits of Calcium in the body?\", \"define weight gain during pregnancy?\", \"How does saliva help with digestion?\"],\n",
        "                        k=1)\n",
        "result"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZlEQT2_M-FYO",
        "outputId": "e4471aef-2c64-44fb-83c9-fbca73ed0abd"
      },
      "execution_count": 50,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results for Query: what are other health benefits of Calcium in the body?\n",
            "\n",
            "Score: 0.78\n",
            "Document:\n",
            "\n",
            "Image by Allison Calabrese / CC BY 4.0 Other Health Benefits of Calcium in the\n",
            "Body Besides forming and maintaining strong bones and teeth, calcium has been\n",
            "shown to have other health benefits for the body, including: • Cancer. The\n",
            "National Cancer Institute reports that there is enough scientific evidence to\n",
            "conclude that higher intakes of calcium decrease colon cancer risk and may\n",
            "suppress the growth of polyps that often precipitate cancer. Although higher\n",
            "calcium consumption protects against colon cancer, some studies have looked at\n",
            "the relationship between calcium and prostate cancer and found higher intakes\n",
            "may increase the risk for prostate cancer; however the data is inconsistent and\n",
            "more studies are needed to confirm any negative association. • Blood pressure.\n",
            "Multiple studies provide clear evidence that higher calcium consumption reduces\n",
            "blood pressure. A review of twenty-three observational studies concluded that\n",
            "for every Calcium | 615\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: define weight gain during pregnancy?\n",
            "\n",
            "Score: 0.69\n",
            "Document:\n",
            "\n",
            "oranges. Additionally, since 1998, food manufacturers have been required to add\n",
            "folate to cereals and other grain products.2 Weight Gain during Pregnancy During\n",
            "pregnancy, a mother’s body changes in many ways. One of the most notable and\n",
            "significant changes is weight gain. If a pregnant woman does not gain enough\n",
            "weight, her unborn baby will be at risk. Poor weight gain, especially in the\n",
            "second and third trimesters, could result not only in low birth weight, but also\n",
            "infant mortality and intellectual disabilities. Therefore, it is vital for a\n",
            "pregnant woman to maintain a healthy amount of weight gain. Her weight prior to\n",
            "pregnancy also has a major effect. Infant birth weight is one of the best\n",
            "indicators of a baby’s future health. Pregnant women of normal prepregnancy\n",
            "weight should gain between 25 and 35 pounds in total through the entire\n",
            "pregnancy. The precise amount that a mother should gain usually depends on her\n",
            "beginning weight or body mass index (BMI).\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: How does saliva help with digestion?\n",
            "\n",
            "Score: 0.54\n",
            "Document:\n",
            "\n",
            "Digestion and Absorption of Carbohydrates UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD\n",
            "SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM From the Mouth\n",
            "to the Stomach The mechanical and chemical digestion of carbohydrates begins in\n",
            "the mouth. Chewing, also known as mastication, crumbles the carbohydrate foods\n",
            "into smaller and smaller pieces. The salivary glands in the oral cavity secrete\n",
            "saliva that coats the food particles. Saliva contains the enzyme, salivary\n",
            "amylase. This enzyme breaks the bonds between the monomeric sugar units of\n",
            "disaccharides, oligosaccharides, and starches. The salivary amylase breaks down\n",
            "amylose and amylopectin into smaller chains of glucose, called dextrins and\n",
            "maltose. The increased concentration of maltose in the mouth that results from\n",
            "the mechanical and chemical breakdown of starches in whole grains is what\n",
            "enhances their sweetness. Only about five percent of starches are broken down in\n",
            "the mouth. (This is a good thing as more glucose in the mouth would lead to more\n",
            "tooth decay.)When carbohydrates reach the stomach no further chemical breakdown\n",
            "occurs because the amylase enzyme does not function in the acidic conditions of\n",
            "the stomach.\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "result = search_queries(queries=[\"How often should infants be breastfed??\", \"what is water soluble vitamins\", \"What are symptoms of pellagra?\"],\n",
        "                        k=1)\n",
        "result"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JQkVBet9_wI3",
        "outputId": "ec3a34c9-55b7-44e7-b94c-93810239e022"
      },
      "execution_count": 51,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results for Query: How often should infants be breastfed??\n",
            "\n",
            "Score: 0.57\n",
            "Document:\n",
            "\n",
            "milk is the best source to fulfill nutritional requirements. An exclusively\n",
            "breastfed infant does not even need extra water, including in hot climates. A\n",
            "newborn infant (birth to 28 days) requires feedings eight to twelve times a day\n",
            "or more. Between 1 and 3 months of age, the breastfed infant becomes more\n",
            "efficient, and the number of feedings per day often become fewer even though the\n",
            "amount of milk consumed stays the same. After about six months, infants can\n",
            "gradually begin to consume solid foods to help meet nutrient needs. Foods that\n",
            "are added in addition to breastmilk are called complementary foods.\n",
            "Complementary foods should be nutrient dense to provide optimal nutrition.\n",
            "Complementary foods include baby meats, vegetables, fruits, infant cereal, and\n",
            "dairy products such as yogurt, but not infant formula. Infant formula is a\n",
            "substitute, not a complement to breastmilk. In addition to complementary foods,\n",
            "the World Health Organization recommends that breastfeeding continue up to 2\n",
            "years of age or beyond, and the American Academy of Pediatrics recommends at\n",
            "least one year of breastfeeding, or longer.\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: what is water soluble vitamins\n",
            "\n",
            "Score: 0.64\n",
            "Document:\n",
            "\n",
            "Water-Soluble Vitamins UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN\n",
            "NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM All water-soluble vitamins play a\n",
            "different kind of role in energy metabolism; they are required as functional\n",
            "parts of enzymes involved in energy release and storage. Vitamins and minerals\n",
            "that make up part of enzymes are referred to as coenzymes and cofactors,\n",
            "respectively. Coenzymes and cofactors are required by enzymes to catalyze a\n",
            "specific reaction. They assist in converting a substrate to an end-product.\n",
            "Coenzymes and cofactors are essential in catabolic pathways and play a role in\n",
            "many anabolic pathways too. In addition to being essential for metabolism, many\n",
            "vitamins and minerals are required for blood renewal and function. At\n",
            "insufficient levels in the diet these vitamins and minerals impair the health of\n",
            "blood and consequently the delivery of nutrients in and wastes out, amongst its\n",
            "many other functions. In this section we will focus on the vitamins that take\n",
            "part in metabolism and blood function and renewal. Figure 9.7 Enzyme Active Site\n",
            "for Cofactors 550 | Water-Soluble Vitamins\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n",
            "Results for Query: What are symptoms of pellagra?\n",
            "\n",
            "Score: 0.59\n",
            "Document:\n",
            "\n",
            "Niacin deficiency is commonly known as pellagra and the symptoms include\n",
            "fatigue, decreased appetite, and indigestion.  These symptoms are then commonly\n",
            "followed by the four D’s: diarrhea, dermatitis, dementia, and sometimes death.\n",
            "Figure 9.12  Conversion of Tryptophan to Niacin Water-Soluble Vitamins | 565\n",
            "--------------------------------------------------\n",
            "\n",
            "\n",
            "====================================================================================================\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "At end lets delete our Pinecone index."
      ],
      "metadata": {
        "id": "jYUgY2F1sIui"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "pc.delete_index(index_name)"
      ],
      "metadata": {
        "id": "Ywo6nYAfV4hv"
      },
      "execution_count": 52,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.8"
    },
    "colab": {
      "provenance": [],
      "toc_visible": true,
      "gpuType": "T4"
    },
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "662d657066044012ba174d1a7a993aa7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_b123cfb995154c5982344f77c7d97118",
              "IPY_MODEL_28e55837d06840efbe1a70b768d165b2",
              "IPY_MODEL_5fe003b2e9f841b8b1f34bdcce4569bd"
            ],
            "layout": "IPY_MODEL_2bc69c558d664d5194582deb344b3ae6"
          }
        },
        "b123cfb995154c5982344f77c7d97118": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_474dfac0c13d49e0a603f73369dcc42e",
            "placeholder": "​",
            "style": "IPY_MODEL_0d2acff2bdd54e8b9fddfaf685144ae7",
            "value": ""
          }
        },
        "28e55837d06840efbe1a70b768d165b2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_cbf9ece884a24a7db323757c719a5ae8",
            "max": 1,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_924c014aca3741aa808b705aa66864ed",
            "value": 1
          }
        },
        "5fe003b2e9f841b8b1f34bdcce4569bd": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_b72b7ebaaa0744699ba2e577a4911b59",
            "placeholder": "​",
            "style": "IPY_MODEL_9a3b5eddeed443aba5908ad2f4e3b588",
            "value": " 1208/? [00:05&lt;00:00, 148.14it/s]"
          }
        },
        "2bc69c558d664d5194582deb344b3ae6": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "474dfac0c13d49e0a603f73369dcc42e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "0d2acff2bdd54e8b9fddfaf685144ae7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "cbf9ece884a24a7db323757c719a5ae8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": "20px"
          }
        },
        "924c014aca3741aa808b705aa66864ed": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "b72b7ebaaa0744699ba2e577a4911b59": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "9a3b5eddeed443aba5908ad2f4e3b588": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "94cc52bff24e4c158e2859195a142498": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_a069ef0cde334591a5dc9cc3f2a5aacf",
              "IPY_MODEL_02e443fff37f4cceab1996ea4893dd3c",
              "IPY_MODEL_d76e01440712420eb7249bf8e6fc1aaa"
            ],
            "layout": "IPY_MODEL_f5809653a4a94e8f80452f1703dc47a5"
          }
        },
        "a069ef0cde334591a5dc9cc3f2a5aacf": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_6792e77105e54799bdd471eec04c7c5d",
            "placeholder": "​",
            "style": "IPY_MODEL_a84fdc3489ad4e78892906e5330e21f2",
            "value": "100%"
          }
        },
        "02e443fff37f4cceab1996ea4893dd3c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_80941748295241d2b712292647b3c623",
            "max": 1208,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_91bc9829243545008533e7a21bb44d8f",
            "value": 1208
          }
        },
        "d76e01440712420eb7249bf8e6fc1aaa": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_a2adca3f8a484ab48b859922f93f380d",
            "placeholder": "​",
            "style": "IPY_MODEL_86b0843d4fc4438fb312d40ad9447dcc",
            "value": " 1208/1208 [00:03&lt;00:00, 441.08it/s]"
          }
        },
        "f5809653a4a94e8f80452f1703dc47a5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "6792e77105e54799bdd471eec04c7c5d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a84fdc3489ad4e78892906e5330e21f2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "80941748295241d2b712292647b3c623": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "91bc9829243545008533e7a21bb44d8f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "a2adca3f8a484ab48b859922f93f380d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "86b0843d4fc4438fb312d40ad9447dcc": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "1016a3c98b6448b7a11852633587ab3f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_74b707712fce45cd98a16b71908916db",
              "IPY_MODEL_494fc75e07d14ca794c9b152e100953e",
              "IPY_MODEL_20c69b545a6d4bbb86344abfea24da58"
            ],
            "layout": "IPY_MODEL_8ffe48cbd52245ca849696746b9dec74"
          }
        },
        "74b707712fce45cd98a16b71908916db": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_711f9bc8e4fb4e5c90aced0384b4cb54",
            "placeholder": "​",
            "style": "IPY_MODEL_67e2aafea7754c23906a8b35e9bd2f08",
            "value": "100%"
          }
        },
        "494fc75e07d14ca794c9b152e100953e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_2e86c19b91f2419e9eb181e105dfb3b0",
            "max": 1208,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_5a03c35e76ca47c5a973023980862c76",
            "value": 1208
          }
        },
        "20c69b545a6d4bbb86344abfea24da58": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_58766a63fc7f4188b41ac32b386769f4",
            "placeholder": "​",
            "style": "IPY_MODEL_1ce1ca10b5b0434aa17417b63db75ab4",
            "value": " 1208/1208 [00:00&lt;00:00, 44330.97it/s]"
          }
        },
        "8ffe48cbd52245ca849696746b9dec74": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "711f9bc8e4fb4e5c90aced0384b4cb54": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "67e2aafea7754c23906a8b35e9bd2f08": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "2e86c19b91f2419e9eb181e105dfb3b0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5a03c35e76ca47c5a973023980862c76": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "58766a63fc7f4188b41ac32b386769f4": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "1ce1ca10b5b0434aa17417b63db75ab4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "b19f39b1fd314d71885b15154debce11": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_9f9948c49b5e4a5e87631d4137bdca3e",
              "IPY_MODEL_4eaeda1e6d94473cb39ca9ad6204b607",
              "IPY_MODEL_2cf3513015d243e7842f6120f939a706"
            ],
            "layout": "IPY_MODEL_2f2f1c2e4df346cbb7b6775e0aa0c55a"
          }
        },
        "9f9948c49b5e4a5e87631d4137bdca3e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ebd0595644334e02abfbafd31264196e",
            "placeholder": "​",
            "style": "IPY_MODEL_6ac351005e7247f391c684e620744259",
            "value": "100%"
          }
        },
        "4eaeda1e6d94473cb39ca9ad6204b607": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_fe005c591c2448fb8f3f4f0bd5cc0bb1",
            "max": 1208,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_980ae4efbcb64f758e1311e4e2b0cad0",
            "value": 1208
          }
        },
        "2cf3513015d243e7842f6120f939a706": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_14c9b952454c40b59325b63be4cc2d7e",
            "placeholder": "​",
            "style": "IPY_MODEL_5a646e132031418d8cefea0a99f22444",
            "value": " 1208/1208 [00:00&lt;00:00, 12366.60it/s]"
          }
        },
        "2f2f1c2e4df346cbb7b6775e0aa0c55a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ebd0595644334e02abfbafd31264196e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "6ac351005e7247f391c684e620744259": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "fe005c591c2448fb8f3f4f0bd5cc0bb1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "980ae4efbcb64f758e1311e4e2b0cad0": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "14c9b952454c40b59325b63be4cc2d7e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5a646e132031418d8cefea0a99f22444": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}