{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "3X2X_7DoyGdH"
},
"source": [
"# Create and run a local RAG pipeline from scratch\n",
"\n",
"The goal of this notebook is to build a RAG (Retrieval Augmented Generation) pipeline from scratch.\n",
"\n",
"Specifically, we'd like to be able to open a PDF file, ask questions (queries) of it and have them answered by a Large Language Model (LLM)."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3YE8fl97yGdO"
},
"source": [
"## Requirements and setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "PhpZmm-3yGdO",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "50ba27d4-bab7-4798-976a-272cdd0ff2d0"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[INFO] Running in Google Colab, installing requirements.\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.4.0+cu121)\n",
"Collecting torch\n",
" Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.16.0)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2024.6.1)\n",
"Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)\n",
" Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
"Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)\n",
" Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
"Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)\n",
" Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
"Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)\n",
" Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
"Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)\n",
" Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
"Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)\n",
" Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
"Collecting nvidia-curand-cu12==10.3.2.106 (from torch)\n",
" Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
"Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch)\n",
" Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
"Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch)\n",
" Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
"Collecting nvidia-nccl-cu12==2.20.5 (from torch)\n",
" Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n",
"Collecting nvidia-nvtx-cu12==12.1.105 (from torch)\n",
" Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n",
"Collecting triton==3.0.0 (from torch)\n",
" Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)\n",
"Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch)\n",
" Downloading nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.5)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n",
"Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl (797.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m797.1/797.1 MB\u001b[0m \u001b[31m842.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m64.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m37.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.2/176.2 MB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.4/209.4 MB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_x86_64.whl (19.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.7/19.7 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: triton, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch\n",
" Attempting uninstall: nvidia-nccl-cu12\n",
" Found existing installation: nvidia-nccl-cu12 2.23.4\n",
" Uninstalling nvidia-nccl-cu12-2.23.4:\n",
" Successfully uninstalled nvidia-nccl-cu12-2.23.4\n",
" Attempting uninstall: torch\n",
" Found existing installation: torch 2.4.0+cu121\n",
" Uninstalling torch-2.4.0+cu121:\n",
" Successfully uninstalled torch-2.4.0+cu121\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"torchaudio 2.4.0+cu121 requires torch==2.4.0, but you have torch 2.4.1 which is incompatible.\n",
"torchvision 0.19.0+cu121 requires torch==2.4.0, but you have torch 2.4.1 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.6.68 nvidia-nvtx-cu12-12.1.105 torch-2.4.1 triton-3.0.0\n",
"Collecting PyMuPDF\n",
" Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
"Collecting PyMuPDFb==1.24.10 (from PyMuPDF)\n",
" Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)\n",
"Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m102.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: PyMuPDFb, PyMuPDF\n",
"Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.66.5)\n",
"Collecting cohere\n",
" Downloading cohere-5.9.2-py3-none-any.whl.metadata (3.4 kB)\n",
"Collecting pinecone-client\n",
" Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)\n",
"Collecting boto3<2.0.0,>=1.34.0 (from cohere)\n",
" Downloading boto3-1.35.20-py3-none-any.whl.metadata (6.6 kB)\n",
"Collecting fastavro<2.0.0,>=1.9.4 (from cohere)\n",
" Downloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)\n",
"Collecting httpx>=0.21.2 (from cohere)\n",
" Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n",
"Collecting httpx-sse==0.4.0 (from cohere)\n",
" Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n",
"Collecting parameterized<0.10.0,>=0.9.0 (from cohere)\n",
" Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)\n",
"Requirement already satisfied: pydantic>=1.9.2 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.9.1)\n",
"Requirement already satisfied: pydantic-core<3.0.0,>=2.18.2 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.23.3)\n",
"Requirement already satisfied: requests<3.0.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.32.3)\n",
"Requirement already satisfied: tokenizers<1,>=0.15 in /usr/local/lib/python3.10/dist-packages (from cohere) (0.19.1)\n",
"Collecting types-requests<3.0.0,>=2.0.0 (from cohere)\n",
" Downloading types_requests-2.32.0.20240914-py3-none-any.whl.metadata (1.9 kB)\n",
"Requirement already satisfied: typing_extensions>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (4.12.2)\n",
"Requirement already satisfied: certifi>=2019.11.17 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2024.8.30)\n",
"Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)\n",
" Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)\n",
" Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)\n",
"Requirement already satisfied: tqdm>=4.64.1 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (4.66.5)\n",
"Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2.0.7)\n",
"Collecting botocore<1.36.0,>=1.35.20 (from boto3<2.0.0,>=1.34.0->cohere)\n",
" Downloading botocore-1.35.20-py3-none-any.whl.metadata (5.7 kB)\n",
"Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.34.0->cohere)\n",
" Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n",
"Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.34.0->cohere)\n",
" Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)\n",
"Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.21.2->cohere) (3.7.1)\n",
"Collecting httpcore==1.* (from httpx>=0.21.2->cohere)\n",
" Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx>=0.21.2->cohere) (3.8)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.21.2->cohere) (1.3.1)\n",
"Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.21.2->cohere)\n",
" Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.9.2->cohere) (0.7.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->cohere) (3.3.2)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from tokenizers<1,>=0.15->cohere) (0.24.7)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.10/dist-packages (from botocore<1.36.0,>=1.35.20->boto3<2.0.0,>=1.34.0->cohere) (2.8.2)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (3.16.0)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (2024.6.1)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (24.1)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers<1,>=0.15->cohere) (6.0.2)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx>=0.21.2->cohere) (1.2.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.36.0,>=1.35.20->boto3<2.0.0,>=1.34.0->cohere) (1.16.0)\n",
"Downloading cohere-5.9.2-py3-none-any.whl (222 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.4/222.4 kB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n",
"Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.8/244.8 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading boto3-1.35.20-py3-none-any.whl (139 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m97.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading parameterized-0.9.0-py2.py3-none-any.whl (20 kB)\n",
"Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.4/85.4 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)\n",
"Downloading types_requests-2.32.0.20240914-py3-none-any.whl (15 kB)\n",
"Downloading botocore-1.35.20-py3-none-any.whl (12.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m104.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
"Downloading s3transfer-0.10.2-py3-none-any.whl (82 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: types-requests, pinecone-plugin-interface, parameterized, jmespath, httpx-sse, h11, fastavro, pinecone-plugin-inference, httpcore, botocore, s3transfer, pinecone-client, httpx, boto3, cohere\n",
"Successfully installed boto3-1.35.20 botocore-1.35.20 cohere-5.9.2 fastavro-1.9.7 h11-0.14.0 httpcore-1.0.5 httpx-0.27.2 httpx-sse-0.4.0 jmespath-1.0.1 parameterized-0.9.0 pinecone-client-5.0.1 pinecone-plugin-inference-1.1.0 pinecone-plugin-interface-0.0.7 s3transfer-0.10.2 types-requests-2.32.0.20240914\n",
"Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.34.2)\n",
"Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (24.1)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0.2)\n",
"Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.4.1)\n",
"Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.24.7)\n",
"Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.4.5)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (3.16.0)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2024.6.1)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\n",
"Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.5)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.13.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.4)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.20.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
"Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.0.0)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.10.0->accelerate) (12.6.68)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.8)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.8.30)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n",
"Collecting bitsandbytes\n",
" Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (2.4.1)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (1.26.4)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.16.0)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (4.12.2)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (1.13.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.1.4)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (2024.6.1)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (2.20.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (12.1.105)\n",
"Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.0.0)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->bitsandbytes) (12.6.68)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->bitsandbytes) (2.1.5)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->bitsandbytes) (1.3.0)\n",
"Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.5/137.5 MB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: bitsandbytes\n",
"Successfully installed bitsandbytes-0.43.3\n",
"Collecting flash-attn\n",
" Downloading flash_attn-2.6.3.tar.gz (2.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from flash-attn) (2.4.1)\n",
"Requirement already satisfied: einops in /usr/local/lib/python3.10/dist-packages (from flash-attn) (0.8.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.16.0)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (4.12.2)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (1.13.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.1.4)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2024.6.1)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2.20.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n",
"Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.0.0)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->flash-attn) (12.6.68)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->flash-attn) (2.1.5)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->flash-attn) (1.3.0)\n",
"Building wheels for collected packages: flash-attn\n",
" Building wheel for flash-attn (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24\n",
" Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826\n",
"Successfully built flash-attn\n",
"Installing collected packages: flash-attn\n",
"Successfully installed flash-attn-2.6.3\n"
]
}
],
"source": [
"# Install the notebook's dependencies when running in Google Colab.\n",
"# The COLAB_GPU environment variable is used as the Colab marker; when it is absent\n",
"# this cell installs nothing and the packages are expected to be available already.\n",
"import os\n",
"\n",
"if \"COLAB_GPU\" in os.environ:\n",
"    print(\"[INFO] Running in Google Colab, installing requirements.\")\n",
"    # torch 2.1.1+ is required (for the efficient sdpa implementation). A version floor is\n",
"    # used instead of `-U`: force-upgrading the preinstalled torch leaves the preinstalled\n",
"    # torchvision/torchaudio builds pinned to a torch version that is no longer installed.\n",
"    %pip install \"torch>=2.1.1\"\n",
"    # For reading PDFs with Python.\n",
"    %pip install PyMuPDF\n",
"    # For progress bars.\n",
"    %pip install tqdm\n",
"    # For embedding models (cohere) and the vector database (pinecone-client).\n",
"    %pip install cohere pinecone-client\n",
"    # For quantization model loading.\n",
"    %pip install accelerate\n",
"    # For quantizing models (less storage space).\n",
"    %pip install bitsandbytes\n",
"    # For a faster attention mechanism = faster LLM inference.\n",
"    # --no-build-isolation lets the build see the torch already installed above.\n",
"    %pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DWlEFjkeyGdP"
},
"source": [
"## 1. Document/Text Processing and Embedding Creation\n",
"\n",
"Ingredients:\n",
"* PDF document of choice.\n",
"* Embedding model of choice (Cohere).\n",
"\n",
"Steps:\n",
"1. Import PDF document.\n",
"2. Process text for embedding (e.g. split into chunks of sentences).\n",
"3. Embed text chunks with embedding model.\n",
"4. Save embeddings to a vector database (Pinecone)."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "V4yEvLvRyGdP"
},
"source": [
"### Import PDF Document\n",
"\n",
"We're going to work on the open-source PDF textbook [*Human Nutrition: 2020 Edition*](https://pressbooks.oer.hawaii.edu/humannutrition2/).\n",
"\n",
"There are several libraries to open PDFs with Python but I found that [PyMuPDF](https://github.com/pymupdf/pymupdf) works quite well.\n",
"\n",
"First we'll download the PDF if it doesn't exist."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "-ULZl6gVyGdQ",
"outputId": "4b65c581-be7f-4b71-8b27-a791676e041a",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"File doesn't exist, downloading...\n",
"The file has been downloaded and saved as human-nutrition-text.pdf\n"
]
}
],
"source": [
"# Download PDF file\n",
"import os\n",
"import requests\n",
"\n",
"# Local path of the PDF; the download is skipped when this file already exists\n",
"pdf_path = \"human-nutrition-text.pdf\"\n",
"\n",
"# Download PDF if it doesn't already exist\n",
"if not os.path.exists(pdf_path):\n",
"    print(\"File doesn't exist, downloading...\")\n",
"\n",
"    # The URL of the PDF you want to download\n",
"    url = \"https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf\"\n",
"\n",
"    # The local filename to save the downloaded file\n",
"    filename = pdf_path\n",
"\n",
"    # Send a GET request to the URL. requests has no default timeout, so without one\n",
"    # a stalled connection would leave this cell running forever.\n",
"    response = requests.get(url, timeout=60)\n",
"\n",
"    # Check if the request was successful\n",
"    if response.status_code == 200:\n",
"        # Open a file in binary write mode and save the content to it\n",
"        with open(filename, \"wb\") as file:\n",
"            file.write(response.content)\n",
"        print(f\"The file has been downloaded and saved as {filename}\")\n",
"    else:\n",
"        # Nothing is written on failure, so re-running the cell retries the download\n",
"        print(f\"Failed to download the file. Status code: {response.status_code}\")\n",
"else:\n",
"    print(f\"File {pdf_path} exists.\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1jNab_TuyGdQ"
},
"source": [
"PDF acquired!\n",
"\n",
"We can import the pages of our PDF to text by first defining the PDF path and then opening and reading it with PyMuPDF (`import fitz`).\n",
"\n",
"We'll write a small helper function to preprocess the text as it gets read.\n",
"\n",
"We'll save each page to a dictionary and then append that dictionary to a list for ease of use later."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "texuMJJKyGdQ",
"outputId": "2dd5f8f9-e07f-4fac-a910-b282636c9cf1",
"colab": {
"referenced_widgets": [
"662d657066044012ba174d1a7a993aa7",
"b123cfb995154c5982344f77c7d97118",
"28e55837d06840efbe1a70b768d165b2",
"5fe003b2e9f841b8b1f34bdcce4569bd",
"2bc69c558d664d5194582deb344b3ae6",
"474dfac0c13d49e0a603f73369dcc42e",
"0d2acff2bdd54e8b9fddfaf685144ae7",
"cbf9ece884a24a7db323757c719a5ae8",
"924c014aca3741aa808b705aa66864ed",
"b72b7ebaaa0744699ba2e577a4911b59",
"9a3b5eddeed443aba5908ad2f4e3b588"
],
"base_uri": "https://localhost:8080/",
"height": 257
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"0it [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "662d657066044012ba174d1a7a993aa7"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'page_number': -42,\n",
" 'page_char_count': 29,\n",
" 'page_word_count': 4,\n",
" 'page_sentence_count_raw': 1,\n",
" 'page_token_count': 7.25,\n",
" 'text': 'Human Nutrition: 2020 Edition'},\n",
" {'page_number': -41,\n",
" 'page_char_count': 0,\n",
" 'page_word_count': 1,\n",
" 'page_sentence_count_raw': 1,\n",
" 'page_token_count': 0.0,\n",
" 'text': ''}]"
]
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"import fitz # PyMuPDF\n",
"from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm\n",
"\n",
"def text_formatter(text: str) -> str:\n",
"    \"\"\"Turn every newline into a single space and trim both ends of the text.\n",
"\n",
"    Parameters:\n",
"        text (str): Raw text, e.g. as extracted from one PDF page.\n",
"\n",
"    Returns:\n",
"        str: The text on one line with leading/trailing whitespace removed.\n",
"    \"\"\"\n",
"    # Newline handling may need to differ for other documents\n",
"    single_line = \" \".join(text.split(\"\\n\"))\n",
"\n",
"    # Further formatting steps can be added here\n",
"    return single_line.strip()\n",
"\n",
"# Open PDF and get lines/pages\n",
"# this only focuses on text\n",
"def open_and_read_pdf(pdf_path: str, page_offset: int = 0) -> list[dict]:\n",
"    \"\"\"\n",
"    Opens a PDF file, reads its text content page by page, and collects statistics.\n",
"\n",
"    Parameters:\n",
"        pdf_path (str): The file path to the PDF document to be opened and read.\n",
"        page_offset (int): Amount subtracted from each zero-based page index to\n",
"            produce the reported page number. Defaults to 0 (report the raw index).\n",
"\n",
"    Returns:\n",
"        list[dict]: A list of dictionaries, each containing the page number\n",
"        (adjusted), character count, word count, sentence count, token count, and the extracted text\n",
"        for each page.\n",
"    \"\"\"\n",
"    pages_and_texts = []\n",
"    # Open the document as a context manager so it is closed once reading finishes (or fails)\n",
"    with fitz.open(pdf_path) as doc:\n",
"        # enumerate() has no length, so pass the page count to get a real progress bar\n",
"        for page_number, page in tqdm(enumerate(doc), total=len(doc)):\n",
"            text = text_formatter(page.get_text())  # plain text of the page, lightly cleaned\n",
"            pages_and_texts.append({\n",
"                \"page_number\": page_number - page_offset,  # shift the zero-based index by the caller's offset\n",
"                \"page_char_count\": len(text),\n",
"                \"page_word_count\": len(text.split(\" \")),  # rough count: an empty page still reports 1\n",
"                \"page_sentence_count_raw\": len(text.split(\". \")),  # rough count: an empty page still reports 1\n",
"                \"page_token_count\": len(text) / 4,  # 1 token = ~4 chars\n",
"                \"text\": text,\n",
"            })\n",
"    return pages_and_texts\n",
"\n",
"# NOTE(review): in the sampled outputs of this notebook, page_number is one less than\n",
"# the number printed in the page footer (e.g. page_number 578 on the page whose footer\n",
"# reads 579), so page_offset=41 may have been intended. Confirm against any later cell\n",
"# that maps page_number back to a PDF index before changing it.\n",
"pages_and_texts = open_and_read_pdf(pdf_path=pdf_path, page_offset=42)\n",
"pages_and_texts[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "drqYJA2cyGdR"
},
"source": [
"Now let's get a random sample of the pages."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "HTvdS7WryGdR",
"outputId": "70be460a-0317-4f1c-99cd-8e6fe995bea5",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'page_number': 591,\n",
" 'page_char_count': 313,\n",
" 'page_word_count': 52,\n",
" 'page_sentence_count_raw': 3,\n",
" 'page_token_count': 78.25,\n",
" 'text': 'recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \\xa0 An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=347 \\xa0 592 | Water-Soluble Vitamins'},\n",
" {'page_number': 1126,\n",
" 'page_char_count': 1533,\n",
" 'page_word_count': 218,\n",
" 'page_sentence_count_raw': 25,\n",
" 'page_token_count': 383.25,\n",
" 'text': 'an incessant fear of weight gain but instead have an obsession with “feeling pure, healthy and natural.”7 People affected by orthorexia nervosa tend to follow diets tied to a philosophy or theory and believe that their theory of eating is the best.8 9 Such diets often have a redemptive quality that involves denying oneself of “bad” or “wrong” foods.10 In extreme cases, affected individuals may also fear contamination or harm from water and electricity leading them to use filters to purify their environment from electrical emissions. 7.\\xa0Mathieu J. (2005). What is orthorexia? Journal of the American Dietetic Association,\\xa0105(10), 1510-1512. Bratman, S. Health Food Junkie. Yoga Journal. 1997,\\xa0 September/October, 42-50. Available at https://www.orthorexia.com/original-orthorexia- essay/. 8.\\xa0Donini LM, Marsili D, Graziani MP, Imbriale M, Cannella C. (2004). Orthorexia nervosa: a preliminary study with a proposal for diagnosis and an attempt to measure the dimension of the phenomenon. Eating and Weight Disorders,\\xa09(2), 151‐157. 9.\\xa0Orthorexia. (2017, February 26). National Eating Disorders Association. https://www.nationaleatingdisorders.org/learn/by- eating-disorder/other/orthorexia 10.\\xa0Mathieu J. (2005). What is orthorexia? Journal of the American Dietetic Association,\\xa0105(10), 1510-1512. Bratman, S. Health Food Junkie. Yoga Journal. 1997,\\xa0 September/October, 42-50. Available at https://www.orthorexia.com/original-orthorexia- essay/. Undernutrition, Overnutrition, and Malnutrition | 1127'},\n",
" {'page_number': 895,\n",
" 'page_char_count': 1535,\n",
" 'page_word_count': 250,\n",
" 'page_sentence_count_raw': 13,\n",
" 'page_token_count': 383.75,\n",
" 'text': 'There are a number of reasons behind this problem, including: • larger portion sizes • limited access to nutrient-rich foods • increased access to fast foods and vending machines • lack of breastfeeding support • declining physical education programs in schools • insufficient physical activity and a sedentary lifestyle • media messages encouraging the consumption of unhealthy foods Children who suffer from obesity are more likely to become overweight or obese adults. Obesity has a profound effect on self- esteem, energy, and activity level. Even more importantly, it is a major risk factor for a number of diseases later in life, including cardiovascular disease, Type 2 diabetes, stroke, hypertension, and certain cancers.6 A percentile for body mass index (BMI) specific to age and sex is used to determine if a child is overweight or obese. This is more appropriate than the BMI categories used for adults because the body composition of children varies as they develop, and differs between boys and girls. If a child gains weight inappropriate to growth, parents and caregivers should limit energy-dense, nutrient-poor snack foods. In addition, it is extremely beneficial to increase a child’s physical activity and limit sedentary activities, facts/epidemic-childhood-obesity. Accessed December 5, 2017. 6.\\xa0Obesity and Overweight Fact Sheet. World Health Organization. http://www.who.int/mediacentre/ factsheets/fs311/en/. Updated October 2017. Accessed November 29, 2017. 896 | Late Adolescence'}]"
]
},
"metadata": {},
"execution_count": 4
}
],
"source": [
"import random\n",
"\n",
"random.sample(pages_and_texts, k=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UmXFEWhRyGdR"
},
"source": [
"### Get some stats on the text\n",
"\n",
"Let's perform a rough exploratory data analysis (EDA) to get an idea of the size of the texts (e.g. character counts, word counts etc) we're working with.\n",
"\n",
"The different sizes of texts will be a good indicator of how we should split our texts.\n",
"\n",
"For now, let's turn our list of dictionaries into a DataFrame and explore it."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "2dhR7orByGdR",
"outputId": "bee274c1-3c66-44a1-a7fe-eb81578a9105",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" page_number page_char_count page_word_count page_sentence_count_raw \\\n",
"0 -42 29 4 1 \n",
"1 -41 0 1 1 \n",
"2 -40 320 54 1 \n",
"3 -39 212 32 1 \n",
"4 -38 797 145 2 \n",
"\n",
" page_token_count text \n",
"0 7.25 Human Nutrition: 2020 Edition \n",
"1 0.00 \n",
"2 80.00 Human Nutrition: 2020 Edition UNIVERSITY OF ... \n",
"3 53.00 Human Nutrition: 2020 Edition by University of... \n",
"4 199.25 Contents Preface University of Hawai‘i at Mā... "
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" page_number | \n",
" page_char_count | \n",
" page_word_count | \n",
" page_sentence_count_raw | \n",
" page_token_count | \n",
" text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" -42 | \n",
" 29 | \n",
" 4 | \n",
" 1 | \n",
" 7.25 | \n",
" Human Nutrition: 2020 Edition | \n",
"
\n",
" \n",
" | 1 | \n",
" -41 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0.00 | \n",
" | \n",
"
\n",
" \n",
" | 2 | \n",
" -40 | \n",
" 320 | \n",
" 54 | \n",
" 1 | \n",
" 80.00 | \n",
" Human Nutrition: 2020 Edition UNIVERSITY OF ... | \n",
"
\n",
" \n",
" | 3 | \n",
" -39 | \n",
" 212 | \n",
" 32 | \n",
" 1 | \n",
" 53.00 | \n",
" Human Nutrition: 2020 Edition by University of... | \n",
"
\n",
" \n",
" | 4 | \n",
" -38 | \n",
" 797 | \n",
" 145 | \n",
" 2 | \n",
" 199.25 | \n",
" Contents Preface University of Hawai‘i at Mā... | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 1208,\n \"fields\": [\n {\n \"column\": \"page_number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 348,\n \"min\": -42,\n \"max\": 1165,\n \"num_unique_values\": 1208,\n \"samples\": [\n 59,\n 712,\n 266\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_char_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 560,\n \"min\": 0,\n \"max\": 2308,\n \"num_unique_values\": 883,\n \"samples\": [\n 1742,\n 1077,\n 1003\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_word_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 95,\n \"min\": 1,\n \"max\": 429,\n \"num_unique_values\": 349,\n \"samples\": [\n 352,\n 78,\n 88\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_sentence_count_raw\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6,\n \"min\": 1,\n \"max\": 32,\n \"num_unique_values\": 30,\n \"samples\": [\n 28,\n 24,\n 22\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_token_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 140.09556874248847,\n \"min\": 0.0,\n \"max\": 577.0,\n \"num_unique_values\": 883,\n \"samples\": [\n 435.5,\n 269.25,\n 250.75\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1180,\n \"samples\": [\n \"Electrolytes Important for Fluid Balance UNIVERSITY OF HAWAI\\u2018I AT M\\u0100NOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM Cells are about 75 percent water and blood plasma is about 95 percent water. Why then, does the water not flow from blood plasma to cells? The force of water also known as hydrostatic pressure maintains the volumes of water between fluid compartments against the force of all dissolved substances. 
The concentration is the amount of particles in a set volume of water. (Recall that individual solutes can differ in concentration between the intracellular and extracellular fluids, but the total concentration of all dissolved substances is equal.) The force driving the water movement through the selectively permeable membrane is the higher solute concentration on the one side. Solutes at different concentrations on either side of a selectively permeable membrane exert a force, called osmotic pressure. The higher concentration of solutes on one side compared to the other of the U-tube exerts osmotic pressure, pulling the water to a higher volume on the side of the U-tube containing more dissolved particles. When the osmotic pressure is equal to the pressure of the water on the selectively permeable membrane, net water movement stops (though it still diffuses back and forth at an equal rate). One equation exemplifying equal concentrations but different volumes is the following 5 grams of glucose in 1 liter = 10 grams of glucose in 2 liters (5g/L = 5g/L) The differences in concentrations of particular substances provide concentration gradients that cells can use to perform work. A concentration gradient is a form of potential energy, like water 172 | Electrolytes Important for Fluid Balance\",\n \"Units of Measure UNIVERSITY OF HAWAI\\u2018I AT M\\u0100NOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM In nutrition, there are two systems of commonly used measurements: Metric and US Customary. We need both because the US won\\u2019t adopt the metric system completely. 
The Metric and US Customary System These are commonly used prefixes for the Metric System: Micro- (\\u03bc) 1/1,000,000th (one millionth) Milli- (m) 1/1000th (one thousandth) Centi- (c) 1/100th (one hundredth) Deci- (d) 1/10th (one tenth) Kilo- (k) 1000x (one thousand times) Mass Metric System US Customary System Conversions Microgram (\\u03bcg) Ounce (oz) 1 oz = 28.35 g Milligram (mg) Pound (lb) 1 lb = 16 oz Gram (g) 1 lb = 454 g Kilogram (kg) 1 kg = 2.2 lbs 18 | Units of Measure\",\n \"activity level. For example, dental problems can lead to difficulties with chewing and swallowing, which in turn can make it hard to maintain a healthy diet. The use of dentures or the preparation of pureed or chopped foods can help solve this problem. There also is a decreased thirst response in the elderly, and the kidneys have a decreased ability to concentrate urine, both of which can lead to dehydration. Sensory Issues At about age sixty, taste buds begin to decrease in size and number. As a result, the taste threshold is higher in older adults, meaning that more of the same flavor must be present to detect the taste. Many elderly people lose the ability to distinguish between salty, sour, sweet, and bitter flavors. This can make food seem less appealing and decrease the appetite. An intake of foods high in sugar and sodium can increase due to an inability to discern those tastes. The sense of smell also decreases, which impacts attitudes toward food. Sensory issues may also affect the digestion because the taste and smell of food stimulates the secretion of digestive\\u00a0enzymes in the mouth, stomach, and pancreas. Dysphagia Some older adults have difficulty getting adequate nutrition because of the disorder dysphagia, which impairs the ability to swallow. Any damage to the parts of the brain that control swallowing can result in dysphagia, therefore stroke is a common cause. Dysphagia is also associated with advanced dementia because of overall brain function impairment. 
To assist older adults suffering from dysphagia, it can be helpful to alter food consistency. Older Adulthood: The Golden Years | 923\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(pages_and_texts)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "_2Mg-5k_yGdR",
"outputId": "bb5fbcc0-329e-4bec-baf6-26252f6e0be9",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" page_number page_char_count page_word_count page_sentence_count_raw \\\n",
"count 1208.00 1208.00 1208.00 1208.00 \n",
"mean 561.50 1148.00 198.30 9.97 \n",
"std 348.86 560.38 95.76 6.19 \n",
"min -42.00 0.00 1.00 1.00 \n",
"25% 259.75 762.00 134.00 4.00 \n",
"50% 561.50 1231.50 214.50 10.00 \n",
"75% 863.25 1603.50 271.00 14.00 \n",
"max 1165.00 2308.00 429.00 32.00 \n",
"\n",
" page_token_count \n",
"count 1208.00 \n",
"mean 287.00 \n",
"std 140.10 \n",
"min 0.00 \n",
"25% 190.50 \n",
"50% 307.88 \n",
"75% 400.88 \n",
"max 577.00 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" page_number | \n",
" page_char_count | \n",
" page_word_count | \n",
" page_sentence_count_raw | \n",
" page_token_count | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
"
\n",
" \n",
" | mean | \n",
" 561.50 | \n",
" 1148.00 | \n",
" 198.30 | \n",
" 9.97 | \n",
" 287.00 | \n",
"
\n",
" \n",
" | std | \n",
" 348.86 | \n",
" 560.38 | \n",
" 95.76 | \n",
" 6.19 | \n",
" 140.10 | \n",
"
\n",
" \n",
" | min | \n",
" -42.00 | \n",
" 0.00 | \n",
" 1.00 | \n",
" 1.00 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 25% | \n",
" 259.75 | \n",
" 762.00 | \n",
" 134.00 | \n",
" 4.00 | \n",
" 190.50 | \n",
"
\n",
" \n",
" | 50% | \n",
" 561.50 | \n",
" 1231.50 | \n",
" 214.50 | \n",
" 10.00 | \n",
" 307.88 | \n",
"
\n",
" \n",
" | 75% | \n",
" 863.25 | \n",
" 1603.50 | \n",
" 271.00 | \n",
" 14.00 | \n",
" 400.88 | \n",
"
\n",
" \n",
" | max | \n",
" 1165.00 | \n",
" 2308.00 | \n",
" 429.00 | \n",
" 32.00 | \n",
" 577.00 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"page_number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 439.1842002346103,\n \"min\": -42.0,\n \"max\": 1208.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 1208.0,\n 561.5,\n 863.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_char_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 692.7598126695862,\n \"min\": 0.0,\n \"max\": 2308.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1148.0,\n 1231.5,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_word_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 380.86345761027945,\n \"min\": 1.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 198.3,\n 214.5,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_sentence_count_raw\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 423.3006680160771,\n \"min\": 1.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 9.97,\n 10.0,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_token_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 373.38318073681745,\n \"min\": 0.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 287.0,\n 307.88,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 6
}
],
"source": [
"# Get stats\n",
"df.describe().round(2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jMNQgx4MyGdS"
},
"source": [
"Okay, looks like our average token count per page is about 287 (estimated at ~4 characters per token).\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "VpMTeek-yGdS"
},
"source": [
"### Further text processing (splitting pages into sentences)\n",
"\n",
"Ideally, we process the text further before embedding it.\n",
"\n",
"A simple method is to break the text into chunks of sentences.\n",
"\n",
"As in, chunk a page of text into groups of 5, 7, 10 or more sentences.\n",
"\n",
"We will follow the workflow of:\n",
"\n",
"`Ingest text -> split it into groups/chunks -> embed the groups/chunks -> use the embeddings`\n",
"\n",
"Some options for splitting text into sentences:\n",
"\n",
"1. Split into sentences with simple rules (e.g. split on \". \" with `text = text.split(\". \")`, like we did above).\n",
"2. Split into sentences with a natural language processing (NLP) library such as [spaCy](https://spacy.io/) or [nltk](https://www.nltk.org/).\n",
"\n",
"Why split into sentences?\n",
"\n",
"* Easier to handle than larger pages of text (especially if pages are densely filled with text).\n",
"* We can get specific and find out which group of sentences was used to help generate an answer within a RAG pipeline.\n",
"\n",
"> **Resource:** See [spaCy install instructions](https://spacy.io/usage).\n",
"\n",
"Let's use spaCy to break our text into sentences since it's likely a bit more robust than just using `text.split(\". \")`."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "-yz9rHMEyGdS",
"outputId": "1105c552-197b-497f-c914-ab684bee7ba4",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[This is a sentence., This another sentence.]"
]
},
"metadata": {},
"execution_count": 7
}
],
"source": [
"from spacy.lang.en import English # English language model\n",
"\n",
"# Build the spaCy English pipeline object; later cells reuse `nlp` to split every page\n",
"nlp = English()\n",
"\n",
"# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/\n",
"nlp.add_pipe(\"sentencizer\")\n",
"\n",
"# Create a document instance as an example\n",
"doc = nlp(\"This is a sentence. This another sentence.\")\n",
"# Sanity check: the two-sentence example must come back as exactly two sentences\n",
"assert len(list(doc.sents)) == 2\n",
"\n",
"# Access the sentences of the document\n",
"list(doc.sents)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Kzn8irI5yGdT"
},
"source": [
"So let's run our small sentencizing pipeline on our pages of text."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "rzj7cThayGdT",
"outputId": "fd0ee782-8189-4a2b-b985-2196539e0609",
"colab": {
"referenced_widgets": [
"94cc52bff24e4c158e2859195a142498",
"a069ef0cde334591a5dc9cc3f2a5aacf",
"02e443fff37f4cceab1996ea4893dd3c",
"d76e01440712420eb7249bf8e6fc1aaa",
"f5809653a4a94e8f80452f1703dc47a5",
"6792e77105e54799bdd471eec04c7c5d",
"a84fdc3489ad4e78892906e5330e21f2",
"80941748295241d2b712292647b3c623",
"91bc9829243545008533e7a21bb44d8f",
"a2adca3f8a484ab48b859922f93f380d",
"86b0843d4fc4438fb312d40ad9447dcc"
],
"base_uri": "https://localhost:8080/",
"height": 49
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/1208 [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "94cc52bff24e4c158e2859195a142498"
}
},
"metadata": {}
}
],
"source": [
"# Add spaCy-derived sentences and their count to every page record\n",
"for item in tqdm(pages_and_texts):\n",
"    # Convert each spaCy sentence span to a plain string as it is produced\n",
"    item[\"sentences\"] = [str(span) for span in nlp(item[\"text\"]).sents]\n",
"\n",
"    # Record how many sentences spaCy found on this page\n",
"    item[\"page_sentence_count_spacy\"] = len(item[\"sentences\"])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XSuwY1o4yGdT",
"outputId": "1fb87ef9-830e-4963-fde8-e3d8703f17d5",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'page_number': 578,\n",
" 'page_char_count': 1284,\n",
" 'page_word_count': 216,\n",
" 'page_sentence_count_raw': 8,\n",
" 'page_token_count': 321.0,\n",
" 'text': 'Image by Allison Calabrese / CC BY 4.0 Folate is especially essential for the growth and specialization of cells of the central nervous system. Children whose mothers were folate-deficient during pregnancy have a higher risk of neural-tube birth defects. Folate deficiency is causally linked to the development of spina bifida, a neural-tube defect that occurs when the spine does not completely enclose the spinal cord. Spina bifida can lead to many physical and mental disabilities (Figure 9.18\\xa0“Spina Bifida in Infants” ). Observational studies show that the prevalence of neural- tube defects was decreased after the fortification of enriched cereal grain products with folate in 1996 in the United States (and 1998 in Canada) compared to before grain products were fortified with folate. Additionally, results of clinical trials have demonstrated that neural-tube defects are significantly decreased in the offspring of mothers who began taking folate supplements one month prior to becoming pregnant and throughout the pregnancy. In response to the scientific evidence, the Food and Nutrition Board of the Institute of Medicine (IOM) raised the RDA for folate to 600 micrograms per day for pregnant women. Some were concerned Water-Soluble Vitamins | 579',\n",
" 'sentences': ['Image by Allison Calabrese / CC BY 4.0 Folate is especially essential for the growth and specialization of cells of the central nervous system.',\n",
" 'Children whose mothers were folate-deficient during pregnancy have a higher risk of neural-tube birth defects.',\n",
" 'Folate deficiency is causally linked to the development of spina bifida, a neural-tube defect that occurs when the spine does not completely enclose the spinal cord.',\n",
" 'Spina bifida can lead to many physical and mental disabilities (Figure 9.18\\xa0“Spina Bifida in Infants” ).',\n",
" 'Observational studies show that the prevalence of neural- tube defects was decreased after the fortification of enriched cereal grain products with folate in 1996 in the United States (and 1998 in Canada) compared to before grain products were fortified with folate.',\n",
" ' Additionally, results of clinical trials have demonstrated that neural-tube defects are significantly decreased in the offspring of mothers who began taking folate supplements one month prior to becoming pregnant and throughout the pregnancy.',\n",
" 'In response to the scientific evidence, the Food and Nutrition Board of the Institute of Medicine (IOM) raised the RDA for folate to 600 micrograms per day for pregnant women.',\n",
" 'Some were concerned Water-Soluble Vitamins | 579'],\n",
" 'page_sentence_count_spacy': 8}]"
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"# Inspect an example\n",
"random.sample(pages_and_texts, k=1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ci4M4fZ2yGdU"
},
"source": [
"Wonderful!\n",
"\n",
"Now let's turn our list of dictionaries into a DataFrame and get some stats."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "c8uMnhkqyGdU",
"outputId": "17753e0b-924b-4cf4-dff7-b6eae15909ab",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" page_number page_char_count page_word_count page_sentence_count_raw \\\n",
"count 1208.00 1208.00 1208.00 1208.00 \n",
"mean 561.50 1148.00 198.30 9.97 \n",
"std 348.86 560.38 95.76 6.19 \n",
"min -42.00 0.00 1.00 1.00 \n",
"25% 259.75 762.00 134.00 4.00 \n",
"50% 561.50 1231.50 214.50 10.00 \n",
"75% 863.25 1603.50 271.00 14.00 \n",
"max 1165.00 2308.00 429.00 32.00 \n",
"\n",
" page_token_count page_sentence_count_spacy \n",
"count 1208.00 1208.00 \n",
"mean 287.00 10.32 \n",
"std 140.10 6.30 \n",
"min 0.00 0.00 \n",
"25% 190.50 5.00 \n",
"50% 307.88 10.00 \n",
"75% 400.88 15.00 \n",
"max 577.00 28.00 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" page_number | \n",
" page_char_count | \n",
" page_word_count | \n",
" page_sentence_count_raw | \n",
" page_token_count | \n",
" page_sentence_count_spacy | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
"
\n",
" \n",
" | mean | \n",
" 561.50 | \n",
" 1148.00 | \n",
" 198.30 | \n",
" 9.97 | \n",
" 287.00 | \n",
" 10.32 | \n",
"
\n",
" \n",
" | std | \n",
" 348.86 | \n",
" 560.38 | \n",
" 95.76 | \n",
" 6.19 | \n",
" 140.10 | \n",
" 6.30 | \n",
"
\n",
" \n",
" | min | \n",
" -42.00 | \n",
" 0.00 | \n",
" 1.00 | \n",
" 1.00 | \n",
" 0.00 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 25% | \n",
" 259.75 | \n",
" 762.00 | \n",
" 134.00 | \n",
" 4.00 | \n",
" 190.50 | \n",
" 5.00 | \n",
"
\n",
" \n",
" | 50% | \n",
" 561.50 | \n",
" 1231.50 | \n",
" 214.50 | \n",
" 10.00 | \n",
" 307.88 | \n",
" 10.00 | \n",
"
\n",
" \n",
" | 75% | \n",
" 863.25 | \n",
" 1603.50 | \n",
" 271.00 | \n",
" 14.00 | \n",
" 400.88 | \n",
" 15.00 | \n",
"
\n",
" \n",
" | max | \n",
" 1165.00 | \n",
" 2308.00 | \n",
" 429.00 | \n",
" 32.00 | \n",
" 577.00 | \n",
" 28.00 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"page_number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 439.1842002346103,\n \"min\": -42.0,\n \"max\": 1208.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 1208.0,\n 561.5,\n 863.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_char_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 692.7598126695862,\n \"min\": 0.0,\n \"max\": 2308.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1148.0,\n 1231.5,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_word_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 380.86345761027945,\n \"min\": 1.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 198.3,\n 214.5,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_sentence_count_raw\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 423.3006680160771,\n \"min\": 1.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 9.97,\n 10.0,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_token_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 373.38318073681745,\n \"min\": 0.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 287.0,\n 307.88,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_sentence_count_spacy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 423.405400861363,\n \"min\": 0.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 10.32,\n 10.0,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 10
}
],
"source": [
"df = pd.DataFrame(pages_and_texts)\n",
"df.describe().round(2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4lZJbailyGdU"
},
"source": [
"For our set of text, it looks like our raw sentence count (e.g. splitting on `\". \"`) is quite close to what spaCy came up with.\n",
"\n",
"Now we've got our text split into sentences, lets group those sentences!"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wAFfalXJyGdU"
},
"source": [
"### Chunking our sentences together\n",
"\n",
"Let's take a step to break down our list of sentences/text into smaller chunks.\n",
"\n",
"This process is referred to as **chunking**.\n",
"\n",
"Why do we do this?\n",
"\n",
"1. Easier to manage similar sized chunks of text.\n",
"2. Don't overload the embedding models capacity for tokens (e.g. if an embedding model has a capacity of 384 tokens, there could be information loss if we try to embed a sequence of 400+ tokens).\n",
"3. Our LLM context window (the amount of tokens an LLM can take in) may be limited and requires compute power so we want to make sure we're using it as well as possible.\n",
"\n",
"There are many different ways emerging for creating chunks of information/text.\n",
"\n",
"For now, we're going to keep it simple and break our pages of sentences into groups of 10 (this number is arbitrary and can be changed).\n",
"\n",
"On average each of our pages has 10 sentences.\n",
"\n",
"And an average total of 287 tokens per page.\n",
"\n",
"So our groups of 10 sentences will also be ~287 tokens long.\n",
"\n",
"To split our groups of sentences into chunks of 10 or less, let's create a function which accepts a list as input and recursively breaks into down into sublists of a specified size."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"id": "lgx5Dy_ayGdU",
"outputId": "0c2cfa69-c02e-4f22-930a-1ca34df52705",
"colab": {
"referenced_widgets": [
"1016a3c98b6448b7a11852633587ab3f",
"74b707712fce45cd98a16b71908916db",
"494fc75e07d14ca794c9b152e100953e",
"20c69b545a6d4bbb86344abfea24da58",
"8ffe48cbd52245ca849696746b9dec74",
"711f9bc8e4fb4e5c90aced0384b4cb54",
"67e2aafea7754c23906a8b35e9bd2f08",
"2e86c19b91f2419e9eb181e105dfb3b0",
"5a03c35e76ca47c5a973023980862c76",
"58766a63fc7f4188b41ac32b386769f4",
"1ce1ca10b5b0434aa17417b63db75ab4"
],
"base_uri": "https://localhost:8080/",
"height": 49
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/1208 [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "1016a3c98b6448b7a11852633587ab3f"
}
},
"metadata": {}
}
],
"source": [
"# Define split size to turn groups of sentences into chunks\n",
"num_sentence_chunk_size = 10\n",
"\n",
"# Create a function that recursively splits a list into desired sizes\n",
"def split_list(input_list: list,\n",
" slice_size: int) -> list[list[str]]:\n",
" \"\"\"\n",
" Splits the input_list into sublists of size slice_size (or as close as possible).\n",
"\n",
" For example, a list of 17 sentences would be split into two lists of [[10], [7]]\n",
" \"\"\"\n",
" return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]\n",
"\n",
"# Loop through pages and texts and split sentences into chunks\n",
"for item in tqdm(pages_and_texts):\n",
" item[\"sentence_chunks\"] = split_list(input_list=item[\"sentences\"],\n",
" slice_size=num_sentence_chunk_size)\n",
" item[\"num_chunks\"] = len(item[\"sentence_chunks\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "DU0Z43XayGdU",
"outputId": "eb12fd92-90c1-4141-efa0-87a3be6a8b80",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'page_number': 684,\n",
" 'page_char_count': 297,\n",
" 'page_word_count': 51,\n",
" 'page_sentence_count_raw': 3,\n",
" 'page_token_count': 74.25,\n",
" 'text': 'recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \\xa0 An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=393 \\xa0 Iodine | 685',\n",
" 'sentences': ['recommended that users complete these activities using a desktop or laptop computer and in Google Chrome.',\n",
" ' \\xa0 An interactive or media element has been excluded from this version of the text.',\n",
" 'You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=393 \\xa0 Iodine | 685'],\n",
" 'page_sentence_count_spacy': 3,\n",
" 'sentence_chunks': [['recommended that users complete these activities using a desktop or laptop computer and in Google Chrome.',\n",
" ' \\xa0 An interactive or media element has been excluded from this version of the text.',\n",
" 'You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=393 \\xa0 Iodine | 685']],\n",
" 'num_chunks': 1}]"
]
},
"metadata": {},
"execution_count": 12
}
],
"source": [
"# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)\n",
"random.sample(pages_and_texts, k=1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "yOlOwJj2yGdU",
"outputId": "b24e5521-38c8-436d-857c-fdc044a938f0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" page_number page_char_count page_word_count page_sentence_count_raw \\\n",
"count 1208.00 1208.00 1208.00 1208.00 \n",
"mean 561.50 1148.00 198.30 9.97 \n",
"std 348.86 560.38 95.76 6.19 \n",
"min -42.00 0.00 1.00 1.00 \n",
"25% 259.75 762.00 134.00 4.00 \n",
"50% 561.50 1231.50 214.50 10.00 \n",
"75% 863.25 1603.50 271.00 14.00 \n",
"max 1165.00 2308.00 429.00 32.00 \n",
"\n",
" page_token_count page_sentence_count_spacy num_chunks \n",
"count 1208.00 1208.00 1208.00 \n",
"mean 287.00 10.32 1.53 \n",
"std 140.10 6.30 0.64 \n",
"min 0.00 0.00 0.00 \n",
"25% 190.50 5.00 1.00 \n",
"50% 307.88 10.00 1.00 \n",
"75% 400.88 15.00 2.00 \n",
"max 577.00 28.00 3.00 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" page_number | \n",
" page_char_count | \n",
" page_word_count | \n",
" page_sentence_count_raw | \n",
" page_token_count | \n",
" page_sentence_count_spacy | \n",
" num_chunks | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
" 1208.00 | \n",
"
\n",
" \n",
" | mean | \n",
" 561.50 | \n",
" 1148.00 | \n",
" 198.30 | \n",
" 9.97 | \n",
" 287.00 | \n",
" 10.32 | \n",
" 1.53 | \n",
"
\n",
" \n",
" | std | \n",
" 348.86 | \n",
" 560.38 | \n",
" 95.76 | \n",
" 6.19 | \n",
" 140.10 | \n",
" 6.30 | \n",
" 0.64 | \n",
"
\n",
" \n",
" | min | \n",
" -42.00 | \n",
" 0.00 | \n",
" 1.00 | \n",
" 1.00 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 25% | \n",
" 259.75 | \n",
" 762.00 | \n",
" 134.00 | \n",
" 4.00 | \n",
" 190.50 | \n",
" 5.00 | \n",
" 1.00 | \n",
"
\n",
" \n",
" | 50% | \n",
" 561.50 | \n",
" 1231.50 | \n",
" 214.50 | \n",
" 10.00 | \n",
" 307.88 | \n",
" 10.00 | \n",
" 1.00 | \n",
"
\n",
" \n",
" | 75% | \n",
" 863.25 | \n",
" 1603.50 | \n",
" 271.00 | \n",
" 14.00 | \n",
" 400.88 | \n",
" 15.00 | \n",
" 2.00 | \n",
"
\n",
" \n",
" | max | \n",
" 1165.00 | \n",
" 2308.00 | \n",
" 429.00 | \n",
" 32.00 | \n",
" 577.00 | \n",
" 28.00 | \n",
" 3.00 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"page_number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 439.1842002346103,\n \"min\": -42.0,\n \"max\": 1208.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 1208.0,\n 561.5,\n 863.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_char_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 692.7598126695862,\n \"min\": 0.0,\n \"max\": 2308.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1148.0,\n 1231.5,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_word_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 380.86345761027945,\n \"min\": 1.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 198.3,\n 214.5,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_sentence_count_raw\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 423.3006680160771,\n \"min\": 1.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 9.97,\n 10.0,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_token_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 373.38318073681745,\n \"min\": 0.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 287.0,\n 307.88,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page_sentence_count_spacy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 423.405400861363,\n \"min\": 0.0,\n \"max\": 1208.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 10.32,\n 10.0,\n 1208.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"num_chunks\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 426.6303015471525,\n \"min\": 0.0,\n \"max\": 1208.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 
1208.0,\n 1.53,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 13
}
],
"source": [
"# Create a DataFrame to get stats\n",
"df = pd.DataFrame(pages_and_texts)\n",
"df.describe().round(2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "eb9k-vsbyGdV"
},
"source": [
"The average number of chunks is around 1.5, this is expected since many of our pages only contain an average of 10 sentences."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tgcgENBHyGdV"
},
"source": [
"### Splitting each chunk into its own item\n",
"\n",
"We'd like to embed each chunk of sentences into its own numerical representation.\n",
"\n",
"Let's create a new list of dictionaries each containing a single chunk of sentences with relative information such as page number as well statistics about each chunk."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "UFSxO5iXyGdV",
"outputId": "b7611c07-724f-4d86-8dd8-694533f56955",
"colab": {
"referenced_widgets": [
"b19f39b1fd314d71885b15154debce11",
"9f9948c49b5e4a5e87631d4137bdca3e",
"4eaeda1e6d94473cb39ca9ad6204b607",
"2cf3513015d243e7842f6120f939a706",
"2f2f1c2e4df346cbb7b6775e0aa0c55a",
"ebd0595644334e02abfbafd31264196e",
"6ac351005e7247f391c684e620744259",
"fe005c591c2448fb8f3f4f0bd5cc0bb1",
"980ae4efbcb64f758e1311e4e2b0cad0",
"14c9b952454c40b59325b63be4cc2d7e",
"5a646e132031418d8cefea0a99f22444"
],
"base_uri": "https://localhost:8080/",
"height": 66
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/1208 [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "b19f39b1fd314d71885b15154debce11"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1843"
]
},
"metadata": {},
"execution_count": 14
}
],
"source": [
"import re\n",
"\n",
"# Split each chunk into its own item\n",
"pages_and_chunks = []\n",
"for item in tqdm(pages_and_texts):\n",
" for sentence_chunk in item[\"sentence_chunks\"]:\n",
" chunk_dict = {}\n",
" chunk_dict[\"page_number\"] = item[\"page_number\"]\n",
"\n",
" # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)\n",
" joined_sentence_chunk = \"\".join(sentence_chunk).replace(\" \", \" \").strip()\n",
" joined_sentence_chunk = re.sub(r'\\.([A-Z])', r'. \\1', joined_sentence_chunk) # \".A\" -> \". A\" for any full-stop/capital letter combo\n",
" chunk_dict[\"sentence_chunk\"] = joined_sentence_chunk\n",
"\n",
" # Get stats about the chunk\n",
" chunk_dict[\"chunk_char_count\"] = len(joined_sentence_chunk)\n",
" chunk_dict[\"chunk_word_count\"] = len([word for word in joined_sentence_chunk.split(\" \")])\n",
" chunk_dict[\"chunk_token_count\"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters\n",
"\n",
" pages_and_chunks.append(chunk_dict)\n",
"\n",
"# How many chunks do we have?\n",
"len(pages_and_chunks)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "9XZYLeXWyGdV",
"outputId": "0b21ba26-2e7e-4a4e-b22c-383414eb1937",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'page_number': 803,\n",
" 'sentence_chunk': 'and eclampsia, which is sometimes referred to as toxemia of pregnancy. This disorder is marked by elevated blood pressure and protein in the urine and is associated with swelling. To prevent preeclampsia, the WHO recommends increasing calcium intake for women consuming diets low in that micronutrient, administering a low dosage of aspirin (75 milligrams), and increasing prenatal checkups. The WHO does not recommend the restriction of dietary salt intake during pregnancy with the aim of preventing the development of pre-eclampsia and its complications12. About 4 percent of pregnant women suffer from a condition known as gestational diabetes, which is abnormal glucose tolerance during pregnancy. The body becomes resistant to the hormone insulin, which enables cells to transport glucose from the blood. Gestational diabetes is usually diagnosed around twenty-four to twenty-six weeks, although it is possible for the condition to develop later into a pregnancy. Signs and symptoms of this disease include extreme hunger, thirst, or fatigue. If blood sugar levels are not properly monitored and treated, the baby might gain too much weight and require a cesarean delivery. Diet and regular physical activity can help to manage this condition.',\n",
" 'chunk_char_count': 1249,\n",
" 'chunk_word_count': 189,\n",
" 'chunk_token_count': 312.25}]"
]
},
"metadata": {},
"execution_count": 15
}
],
"source": [
"# View a random sample\n",
"random.sample(pages_and_chunks, k=1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ME1zoPDryGdV"
},
"source": [
"Excellent!\n",
"\n",
"Now we've broken our whole textbook into chunks of 10 sentences or less as well as the page number they came from.\n",
"\n",
"This means we could reference a chunk of text and know its source.\n",
"\n",
"Let's get some stats about our chunks."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"id": "lzeLIECnyGdV",
"outputId": "b923a384-39be-4435-9bad-735064d37d54",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" page_number chunk_char_count chunk_word_count chunk_token_count\n",
"count 1843.00 1843.00 1843.00 1843.00\n",
"mean 582.38 734.44 112.33 183.61\n",
"std 347.79 447.54 71.22 111.89\n",
"min -42.00 12.00 3.00 3.00\n",
"25% 279.50 315.00 44.00 78.75\n",
"50% 585.00 746.00 114.00 186.50\n",
"75% 889.00 1118.50 173.00 279.62\n",
"max 1165.00 1831.00 297.00 457.75"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" page_number | \n",
" chunk_char_count | \n",
" chunk_word_count | \n",
" chunk_token_count | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 1843.00 | \n",
" 1843.00 | \n",
" 1843.00 | \n",
" 1843.00 | \n",
"
\n",
" \n",
" | mean | \n",
" 582.38 | \n",
" 734.44 | \n",
" 112.33 | \n",
" 183.61 | \n",
"
\n",
" \n",
" | std | \n",
" 347.79 | \n",
" 447.54 | \n",
" 71.22 | \n",
" 111.89 | \n",
"
\n",
" \n",
" | min | \n",
" -42.00 | \n",
" 12.00 | \n",
" 3.00 | \n",
" 3.00 | \n",
"
\n",
" \n",
" | 25% | \n",
" 279.50 | \n",
" 315.00 | \n",
" 44.00 | \n",
" 78.75 | \n",
"
\n",
" \n",
" | 50% | \n",
" 585.00 | \n",
" 746.00 | \n",
" 114.00 | \n",
" 186.50 | \n",
"
\n",
" \n",
" | 75% | \n",
" 889.00 | \n",
" 1118.50 | \n",
" 173.00 | \n",
" 279.62 | \n",
"
\n",
" \n",
" | max | \n",
" 1165.00 | \n",
" 1831.00 | \n",
" 297.00 | \n",
" 457.75 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"page_number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 589.9857395234833,\n \"min\": -42.0,\n \"max\": 1843.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 582.38,\n 585.0,\n 1843.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk_char_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 674.7972839940261,\n \"min\": 12.0,\n \"max\": 1843.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 734.44,\n 746.0,\n 1843.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk_word_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 616.9738147936844,\n \"min\": 3.0,\n \"max\": 1843.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 112.33,\n 114.0,\n 1843.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk_token_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 601.8910558517096,\n \"min\": 3.0,\n \"max\": 1843.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 183.61,\n 186.5,\n 1843.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"# Get stats about our chunks\n",
"df = pd.DataFrame(pages_and_chunks)\n",
"df.describe().round(2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Z5TKYZ7hyGdV"
},
"source": [
"Looks like some of our chunks have quite a low token count.\n",
"\n",
"Let's check for samples with less than 30 tokens (about the length of a sentence) and see if they are worth keeping?"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"id": "mCw52c7myGdY",
"outputId": "56bd6e25-eb30-4185-a93c-120930f90edc",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Chunk token count: 24.75 | Text: http://www.ajcn.org/content/87/1/64.long. Accessed September 22, 2017. 554 | Water-Soluble Vitamins\n",
"Chunk token count: 26.5 | Text: It is stored in the rectum until it is expelled through the anus via defecation. The Digestive System | 77\n",
"Chunk token count: 11.0 | Text: Accessed October 5, 2017. Introduction | 433\n",
"Chunk token count: 19.25 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=519 Introduction | 991\n",
"Chunk token count: 17.75 | Text: Table 6.1 Essential and Nonessential Amino Acids Defining Protein | 365\n"
]
}
],
"source": [
"# Show random chunks with under 30 tokens in length\n",
"min_token_length = 30\n",
"for row in df[df[\"chunk_token_count\"] <= min_token_length].sample(5).iterrows():\n",
" print(f'Chunk token count: {row[1][\"chunk_token_count\"]} | Text: {row[1][\"sentence_chunk\"]}')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5N_hTluhyGdY"
},
"source": [
"Looks like many of these are headers and footers of different pages.\n",
"\n",
"They don't seem to offer too much information.\n",
"\n",
"Let's filter our DataFrame/list of dictionaries to only include chunks with over 30 tokens in length."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"id": "-6d8UvjryGdZ",
"outputId": "54d6477c-1746-40e3-876b-5e8e8fa6988d",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'page_number': -40,\n",
" 'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',\n",
" 'chunk_char_count': 308,\n",
" 'chunk_word_count': 42,\n",
" 'chunk_token_count': 77.0},\n",
" {'page_number': -39,\n",
" 'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',\n",
" 'chunk_char_count': 210,\n",
" 'chunk_word_count': 30,\n",
" 'chunk_token_count': 52.5}]"
]
},
"metadata": {},
"execution_count": 18
}
],
"source": [
"pages_and_chunks_over_min_token_len = df[df[\"chunk_token_count\"] > min_token_length].to_dict(orient=\"records\")\n",
"pages_and_chunks_over_min_token_len[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0PdbYyR5yGdZ"
},
"source": [
"Smaller chunks filtered!\n",
"\n",
"Time to embed our chunks of text!"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GlljGim2yGdZ"
},
"source": [
"### Embedding our text chunks\n",
"\n",
"While humans understand text, machines understand numbers best.\n",
"\n",
"An [embedding](https://vickiboykis.com/what_are_embeddings/index.html) is a broad concept.\n",
"\n",
"A simple definitions is \"a useful numerical representation\".\n",
"\n",
"The most powerful thing about modern embeddings is that they are *learned* representations.\n",
"\n",
"Meaning rather than directly mapping words/tokens/characters to numbers directly (e.g. `{\"a\": 0, \"b\": 1, \"c\": 3...}`), the numerical representation of tokens is learned by going through large corpuses of text and figuring out how different tokens relate to each other.\n",
"\n",
"Our goal is to turn each of our chunks into a numerical representation (an embedding vector, where a vector is a sequence of numbers arranged in order).\n",
"\n",
"\n",
"To do so, we'll use the [Cohere](https://cohere.com/embed) embedding model.\n",
"\n",
"Specifically, we'll get the `embed-english-v2.0` model (you can see the model's intended use on the [Model](https://docs.cohere.com/reference/embed)).\n",
"\n",
"Upload these vector embeddings into [Pinecone](https://docs.pinecone.io/guides/get-started/quickstart)."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"id": "v3XzdfNXyGda"
},
"outputs": [],
"source": [
"# Turn text chunks into a single list\n",
"text_chunks = [item[\"sentence_chunk\"] for item in pages_and_chunks_over_min_token_len]"
]
},
{
"cell_type": "code",
"source": [
"COHERE_KEY = 'mMJW7g9UDwQCFhtW905hK854aQJPoU13cRevsrvg'"
],
"metadata": {
"id": "K17nl6ERMbUM"
},
"execution_count": 20,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"#### Create Embeddings"
],
"metadata": {
"id": "BUjgmxH3o_2v"
}
},
{
"cell_type": "code",
"source": [
"import cohere\n",
"\n",
"co = cohere.Client(COHERE_KEY)"
],
"metadata": {
"id": "4Wuu7-biMwXw"
},
"execution_count": 21,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%%time\n",
"\n",
"# Embed all texts\n",
"embeds = co.embed(\n",
" texts=text_chunks,\n",
" model='embed-english-v2.0',\n",
" input_type='search_query',\n",
" truncate='END'\n",
").embeddings"
],
"metadata": {
"id": "psNUNLrnM7HG",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "aa4272c6-5f34-415c-a581-9fcc3da44bce"
},
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU times: user 46.6 s, sys: 1.16 s, total: 47.8 s\n",
"Wall time: 52.9 s\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Check the dimensionality of the returned vectors. We will need to save the embedding dimensionality from this to be used when initializing your Pinecone index later"
],
"metadata": {
"id": "1j_vH4J7pdjP"
}
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"\n",
"shape = np.array(embeds).shape\n",
"shape"
],
"metadata": {
"id": "KaYqJ8XnN4IN",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3cc1e958-535b-40bd-aca6-bec61a0b5c1b"
},
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(1680, 4096)"
]
},
"metadata": {},
"execution_count": 23
}
]
},
{
"cell_type": "markdown",
"source": [
"We can see the 4096 embedding dimensionality produced by Cohere’s `embed-english-v2.0` model, and the 1680 samples we built embeddings for."
],
"metadata": {
"id": "2oy7c9CmptEZ"
}
},
{
"cell_type": "markdown",
"source": [
"#### Store the embeddings"
],
"metadata": {
"id": "XUUyEyOJpNT2"
}
},
{
"cell_type": "markdown",
"source": [
"Now that we have our embeddings, we can move on to indexing them in the Pinecone vector database."
],
"metadata": {
"id": "mZU5ruxlp8kA"
}
},
{
"cell_type": "markdown",
"source": [
"We first initialize our connection to Pinecone and then create a new index called cohere-pinecone for storing the embeddings. When creating the index, we specify that we would like to use the cosine similarity metric to align with Cohere’s embeddings, and also pass the embedding dimensionality of 4096."
],
"metadata": {
"id": "NyqtOoawqHx4"
}
},
{
"cell_type": "code",
"source": [
"from pinecone import Pinecone, ServerlessSpec\n",
"import os\n",
"\n",
"# Use the API key to initialize the Pinecone client\n",
"pc = Pinecone(api_key='bdb5ea29-449c-4e3d-8075-a0898d1b8404')\n",
"\n",
"index_name = 'cohere-pinecone'\n",
"\n",
"# if the index does not exist, we create it\n",
"if index_name not in pc.list_indexes().names():\n",
" pc.create_index(\n",
" name=index_name,\n",
" dimension=shape[1],\n",
" metric=\"cosine\",\n",
" spec=ServerlessSpec(\n",
" cloud='aws',\n",
" region='us-east-1'\n",
" )\n",
" )\n",
"\n",
"# connect to index\n",
"index = pc.Index(index_name)"
],
"metadata": {
"id": "Ddena3FORyK8"
},
"execution_count": 24,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now we can begin populating the index with our embeddings. Pinecone expects us to provide a list of tuples in the format (*id, vector, metadata*), where the metadata field is an optional extra field where we can store anything we want in a dictionary format. For this example, we will store the original text of the embeddings."
],
"metadata": {
"id": "bE51nIMaqfcb"
}
},
{
"cell_type": "markdown",
"source": [
"While uploading our data, we will batch everything to avoid pushing too much data in one go."
],
"metadata": {
"id": "8iOr_78fqto0"
}
},
{
"cell_type": "code",
"source": [
"batch_size = 128\n",
"\n",
"ids = [str(i) for i in range(shape[0])]\n",
"# create list of metadata dictionaries\n",
"meta = [{'text': text} for text in text_chunks]\n",
"\n",
"# create list of (id, vector, metadata) tuples to be upserted\n",
"to_upsert = list(zip(ids, embeds, meta))\n",
"\n",
"for i in range(0, shape[0], batch_size):\n",
" i_end = min(i+batch_size, shape[0])\n",
" index.upsert(vectors=to_upsert[i:i_end])\n",
"\n",
"# let's view the index statistics\n",
"index.describe_index_stats()"
],
"metadata": {
"id": "2nJPhPSfSkLD",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "01a010bf-59a5-49d6-b0d2-e6789380e6ef"
},
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'dimension': 4096,\n",
" 'index_fullness': 0.0,\n",
" 'namespaces': {'': {'vector_count': 1680}},\n",
" 'total_vector_count': 1680}"
]
},
"metadata": {},
"execution_count": 25
}
]
},
{
"cell_type": "markdown",
"source": [
"We can see from `index.describe_index_stats()` that we have a 4096-dimensionality index populated with 1680 embeddings. Note that serverless indexes scale automatically as needed, so the index_fullness metric is relevant only for pod-based indexes."
],
"metadata": {
"id": "BJBAU1Bdqz90"
}
},
{
"cell_type": "markdown",
"source": [
"### Semantic Search\n",
"\n",
"Now that we have our indexed vectors, we can perform a few search queries. When searching, we will first embed our query using Cohere, and then search using the returned vector in Pinecone."
],
"metadata": {
"id": "0_rRJN4-rIYZ"
}
},
{
"cell_type": "code",
"source": [
"# Functionising the semantic search\n",
"import textwrap # for wrapping text\n",
"\n",
"def search_queries(queries: list[str], k: int = 1) -> dict:\n",
" \"\"\"\n",
" Function to embed multiple queries, search in Pinecone, and return the top-k results.\n",
"\n",
" Args:\n",
" - queries (list): A list of query strings.\n",
" - k (int): The number of top results to retrieve for each query (default is 1).\n",
"\n",
" Returns:\n",
" - results (dict): A dictionary where each query maps to its top-k results.\n",
" \"\"\"\n",
" # Step 1: Create embeddings for all queries\n",
" query_embeddings = co.embed(\n",
" texts=queries,\n",
" model='embed-english-v2.0',\n",
" input_type='search_query',\n",
" truncate='END'\n",
" ).embeddings\n",
"\n",
" # Step 2: Perform Pinecone search for each query embedding\n",
" all_results = {}\n",
"\n",
" for i, query_embedding in enumerate(query_embeddings):\n",
" # Query Pinecone index with each query embedding\n",
" res = index.query(vector=query_embedding, top_k=k, include_metadata=True)\n",
"\n",
" # Store the result for each query (as a list of matches)\n",
" all_results[queries[i]] = res['matches']\n",
"\n",
" # Step 3: Display results for each query\n",
" wrapper = textwrap.TextWrapper(width=80)\n",
" for query, matches in all_results.items():\n",
" print(f\"Results for Query: {query}\\n\")\n",
"\n",
" # Iterate over the top-k matches\n",
" for match in matches:\n",
" score = match['score']\n",
" text = match['metadata']['text']\n",
"\n",
" # Wrap the text to fit within 80 characters per line\n",
" wrapped_text = wrapper.fill(text=text)\n",
"\n",
" # Print the score and corresponding text in a readable format\n",
" print(f\"Score: {score:.2f}\")\n",
" print(\"Document:\\n\")\n",
" print(f\"{wrapped_text}\\n{'-'*50}\\n\") # Divider to separate results\n",
"\n",
" # Larger divider between different queries\n",
" print(f\"\\n{'='*100}\\n\")\n"
],
"metadata": {
"id": "6zW58XIHvSU8"
},
"execution_count": 26,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Let's Look our Result"
],
"metadata": {
"id": "5z59tIj9sFdM"
}
},
{
"cell_type": "code",
"source": [
"result = search_queries(queries=[\"macro nutrients\"],\n",
" k=1)\n",
"result"
],
"metadata": {
"id": "OkoiXD78YdE0",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "0d361fea-587f-43b6-af67-3155e0c9f12b"
},
"execution_count": 27,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Results for Query: macro nutrients\n",
"\n",
"Score: 0.63\n",
"Document:\n",
"\n",
"Macronutrients Nutrients that are needed in large amounts are called\n",
"macronutrients. There are three classes of macronutrients: carbohydrates,\n",
"lipids, and proteins. These can be metabolically processed into cellular energy.\n",
"The energy from macronutrients comes from their chemical bonds. This chemical\n",
"energy is converted into cellular energy that is then utilized to perform work,\n",
"allowing our bodies to conduct their basic functions. A unit of measurement of\n",
"food energy is the calorie. On nutrition food labels the amount given for\n",
"“calories” is actually equivalent to each calorie multiplied by one thousand. A\n",
"kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with\n",
"the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a\n",
"macronutrient in the sense that you require a large amount of it, but unlike the\n",
"other macronutrients, it does not yield calories. Carbohydrates Carbohydrates\n",
"are molecules composed of carbon, hydrogen, and oxygen.\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### Several example queries"
],
"metadata": {
"id": "gwa0gATQ7eKE"
}
},
{
"cell_type": "markdown",
"source": [
"Multiple-queries"
],
"metadata": {
"id": "yXFSlz3E9Xp8"
}
},
{
"cell_type": "code",
"source": [
"result = search_queries(queries=[\"what is carbohydrates\", \"what is fats\", \"What is Starch\"],\n",
" k=1)\n",
"result"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bURHakxd7rEV",
"outputId": "2985986f-f657-41f0-a48c-2b44f65069c1"
},
"execution_count": 33,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Results for Query: what is carbohydrates\n",
"\n",
"Score: 0.62\n",
"Document:\n",
"\n",
"Carbohydrat es are broken down into the subgroups simple and complex\n",
"carbohydrate s. These subgroups are further categorized into mono-, di-, and\n",
"polysacchari des. indigestible carbohydrates provide a good amount of fiber with\n",
"a host of other health benefits. Plants synthesize the fast-releasing\n",
"carbohydrate, glucose, from carbon dioxide in the air and water, and by\n",
"harnessing the sun’s energy. Recall that plants convert the energy in sunlight\n",
"to chemical energy in the molecule, glucose. Plants use glucose to make other\n",
"larger, more slow-releasing carbohydrates. When we eat plants we harvest the\n",
"energy of glucose to support life’s processes. Figure 4.1 Carbohydrate\n",
"Classification Scheme Carbohydrates are a group of organic compounds containing\n",
"a ratio of one carbon atom to two hydrogen atoms to one oxygen atom. Basically,\n",
"they are hydrated carbons. The word “carbo” means carbon and “hydrate” means\n",
"water. Glucose, the most abundant carbohydrate in the human body, has six carbon\n",
"atoms, twelve hydrogen atoms, and six oxygen atoms.\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: what is fats\n",
"\n",
"Score: 0.59\n",
"Document:\n",
"\n",
"scarce. Our ability to store excess caloric energy as fat for future usage\n",
"allowed us to continue as a species during these times of famine. So, normal fat\n",
"reserves are a signal that metabolic processes are efficient and a person is\n",
"healthy. Lipids are a family of organic compounds that are mostly insoluble in\n",
"water. Composed of fats and oils, lipids are molecules that yield high energy\n",
"and have a chemical composition mainly of carbon, hydrogen, and oxygen. Lipids\n",
"perform three primary biological functions within the body: they serve as\n",
"structural components of cell membranes, function as energy storehouses, and\n",
"function as important signaling molecules. The three main types of lipids are\n",
"triglycerides, phospholipids, and sterols. Triglycerides make up more than 95\n",
"percent of lipids in the diet and are commonly found in fried foods, vegetable\n",
"oil, butter, whole milk, cheese, cream cheese, and some meats. Naturally\n",
"occurring triglycerides are found in many foods, including avocados, olives,\n",
"corn, and nuts. We commonly call the triglycerides in our food “fats” and\n",
"“oils.”\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: What is Starch\n",
"\n",
"Score: 0.53\n",
"Document:\n",
"\n",
"Complex/Slow-Releasing Carbohydrates Complex carbohydrates are polysaccharides,\n",
"long chains of monosaccharides that may be branched or not branched. There are\n",
"two main groups of polysaccharides: starches and fibers. Starches Starch\n",
"molecules are found in abundance in grains, legumes, and root vegetables, such\n",
"as potatoes. Amylose, a plant starch, is a linear chain containing hundreds of\n",
"glucose units. Amylopectin, another plant starch, is a branched chain containing\n",
"thousands of glucose units. These large starch molecules form crystals and are\n",
"the energy-storing molecules of plants. These two starch molecules (amylose and\n",
"amylopectin) are contained together in foods, but the smaller one, amylose, is\n",
"less abundant. Eating raw foods containing starches provides very little energy\n",
"as the digestive system has a hard time breaking them down. Cooking breaks down\n",
"the crystal structure of starches, making them much easier to break down in the\n",
"human body. The starches that remain intact throughout digestion are called\n",
"resistant starches.\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"result = search_queries(queries=[\"what are fat-soluble vitamins?\", \"What are the causes of type 2 diabetes?\"],\n",
" k=1)\n",
"result"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i0VgFvI5787K",
"outputId": "8bb51c7f-744e-43ec-9b6d-b6fd414400f7"
},
"execution_count": 32,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Results for Query: what are fat-soluble vitamins?\n",
"\n",
"Score: 0.67\n",
"Document:\n",
"\n",
"subcutaneous fat, or fat underneath the skin. This blanket layer of tissue\n",
"insulates the body from extreme temperatures and helps keep the internal climate\n",
"under control. It pads our hands and buttocks and prevents friction, as these\n",
"areas frequently come in contact with hard surfaces. It also gives the body the\n",
"extra padding required when engaging in physically demanding activities such as\n",
"ice- or roller skating, horseback riding, or snowboarding. Aiding Digestion and\n",
"Increasing Bioavailability The dietary fats in the foods we eat break down in\n",
"our digestive systems and begin the transport of precious micronutrients. By\n",
"carrying fat-soluble nutrients through the digestive process, intestinal\n",
"absorption is improved. This improved absorption is also known as increased\n",
"bioavailability. Fat-soluble nutrients are especially important for good health\n",
"and exhibit a variety of functions. Vitamins A, D, E, and K—the fat-soluble\n",
"vitamins—are mainly found in foods containing fat. Some fat-soluble vitamins\n",
"(such as vitamin A) are also found in naturally fat-free foods such as green\n",
"leafy vegetables, carrots, and broccoli.\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: What are the causes of type 2 diabetes?\n",
"\n",
"Score: 0.76\n",
"Document:\n",
"\n",
"To see how the rise in obesity in this country is paralleled by the rise in Type\n",
"2 diabetes, review this report by the CDC.\n",
"https://www.cdc.gov/diabetes/statistics/slides/ maps_diabetesobesity_trends.pdf \n",
"What is the causal relationship between overnutrition and Type 2 diabetes?The\n",
"prevailing theory is that the overconsumption of high-fat and high-sugar foods\n",
"causes changes in muscle, fat, and liver cells that leads to a diminished\n",
"response from the pancreatic hormone insulin. These cells are called “insulin-\n",
"resistant.”Insulin is released after a meal and instructs the liver and other\n",
"tissues to take up glucose and fatty acids that are circulating in the blood.\n",
"When cells are resistant to insulin they do not take up enough glucose and fatty\n",
"acids, so glucose and fatty acids remain at high concentrations in the blood.\n",
"The chronic elevation of glucose and fatty acids in the blood also causes damage\n",
"to other tissues over time, so that people who have Type 2 diabetes are at\n",
"increased risk for cardiovascular disease, kidney disease, nerve damage, and eye\n",
"disease. The Endocrine System | 107\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"result = search_queries(queries=[\"What is the importance of hydration for physical performance?\", \"What role does fibre play in digestion?\", \"What is the RDA for protein per day?\"],\n",
" k=1)\n",
"result"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b6C1DkwI859s",
"outputId": "6ea955e4-df86-4b9c-e396-a95316583fb0"
},
"execution_count": 37,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Results for Query: What is the importance of hydration for physical performance?\n",
"\n",
"Score: 0.68\n",
"Document:\n",
"\n",
"Image by Allison Calabrese / CC BY 4.0 Water and Electrolyte Needs UNIVERSITY OF\n",
"HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION\n",
"PROGRAM During exercise, being appropriately hydrated contributes to\n",
"performance. Water is needed to cool the body, transport oxygen and nutrients,\n",
"and remove waste products from the muscles. Water needs are increased during\n",
"exercise due to the extra water losses through evaporation and sweat.\n",
"Dehydration can occur when there is inadequate water levels in the body and can\n",
"be very hazardous to the health of an individual. As the severity of dehydration\n",
"increases, the exercise performance of an individual will begin to decline (see\n",
"Figure 16.9 “Dehydration Effect on Exercise Performance”). It is important to\n",
"continue to consume water before, during and after exercise to avoid dehydration\n",
"as much as possible. Figure 16.9 Dehydration Effect on Exercise Performance\n",
"During exercise, thirst is not a reliable short term indicator of the body’s\n",
"needs as it typically is not enough to replace the water loss. Even with the\n",
"constant replenishing of water throughout an 972 | Water and Electrolyte Needs\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: What role does fibre play in digestion?\n",
"\n",
"Score: 0.62\n",
"Document:\n",
"\n",
"Image by Allison Calabrese / CC BY 4.0 fiber intake because of what the\n",
"breakdown products of the fiber do for the colon. The bacterial breakdown of\n",
"fiber in the large intestine releases short-chain fatty acids. These molecules\n",
"have been found to nourish colonic cells, inhibit colonic inflammation, and\n",
"stimulate the immune system (thereby providing protection of the colon from\n",
"harmful substances). Additionally, the bacterial indigestible fiber, mostly\n",
"insoluble, increases stool bulk and softness increasing transit time in the\n",
"large intestine and facilitating feces elimination. One phenomenon of consuming\n",
"foods high in fiber is increased gas, since the byproducts of bacterial\n",
"digestion of fiber are gases. Figure 18.2 Diverticulitis: A Disease of Fiber\n",
"Deficiency Some studies have found a link between high dietary-fiber intake and\n",
"a decreased risk for colon cancer. However an analysis of 1086 | Nutrition,\n",
"Health and Disease\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: What is the RDI for protein per day?\n",
"\n",
"Score: 0.66\n",
"Document:\n",
"\n",
"Proteins, Diet, and Personal Choices UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE\n",
"AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM We have discussed what\n",
"proteins are, how they are made, how they are digested and absorbed, the many\n",
"functions of proteins in the body, and the consequences of having too little or\n",
"too much protein in the diet. This section will provide you with information on\n",
"how to determine the recommended amount of protein for you, and your many\n",
"choices in designing an optimal diet with high-quality protein sources. How Much\n",
"Protein Does a Person Need in Their Diet? The recommendations set by the IOM for\n",
"the Recommended Daily Allowance (RDA) and AMDR for protein for different age\n",
"groups are listed in Table 6.2 “Dietary Reference Intakes for Protein”. A\n",
"Tolerable Upper Intake Limit for protein has not been set, but it is recommended\n",
"that you do not exceed the upper end of the AMDR. Table 6.2 Dietary Reference\n",
"Intakes for Protein Proteins, Diet, and Personal Choices | 409\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"result = search_queries(queries=[\"what are other health benefits of Calcium in the body?\", \"define weight gain during pregnancy?\", \"How does saliva help with digestion?\"],\n",
" k=1)\n",
"result"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZlEQT2_M-FYO",
"outputId": "e4471aef-2c64-44fb-83c9-fbca73ed0abd"
},
"execution_count": 50,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Results for Query: what are other health benefits of Calcium in the body?\n",
"\n",
"Score: 0.78\n",
"Document:\n",
"\n",
"Image by Allison Calabrese / CC BY 4.0 Other Health Benefits of Calcium in the\n",
"Body Besides forming and maintaining strong bones and teeth, calcium has been\n",
"shown to have other health benefits for the body, including: • Cancer. The\n",
"National Cancer Institute reports that there is enough scientific evidence to\n",
"conclude that higher intakes of calcium decrease colon cancer risk and may\n",
"suppress the growth of polyps that often precipitate cancer. Although higher\n",
"calcium consumption protects against colon cancer, some studies have looked at\n",
"the relationship between calcium and prostate cancer and found higher intakes\n",
"may increase the risk for prostate cancer; however the data is inconsistent and\n",
"more studies are needed to confirm any negative association. • Blood pressure.\n",
"Multiple studies provide clear evidence that higher calcium consumption reduces\n",
"blood pressure. A review of twenty-three observational studies concluded that\n",
"for every Calcium | 615\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: define weight gain during pregnancy?\n",
"\n",
"Score: 0.69\n",
"Document:\n",
"\n",
"oranges. Additionally, since 1998, food manufacturers have been required to add\n",
"folate to cereals and other grain products.2 Weight Gain during Pregnancy During\n",
"pregnancy, a mother’s body changes in many ways. One of the most notable and\n",
"significant changes is weight gain. If a pregnant woman does not gain enough\n",
"weight, her unborn baby will be at risk. Poor weight gain, especially in the\n",
"second and third trimesters, could result not only in low birth weight, but also\n",
"infant mortality and intellectual disabilities. Therefore, it is vital for a\n",
"pregnant woman to maintain a healthy amount of weight gain. Her weight prior to\n",
"pregnancy also has a major effect. Infant birth weight is one of the best\n",
"indicators of a baby’s future health. Pregnant women of normal prepregnancy\n",
"weight should gain between 25 and 35 pounds in total through the entire\n",
"pregnancy. The precise amount that a mother should gain usually depends on her\n",
"beginning weight or body mass index (BMI).\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: How does saliva help with digestion?\n",
"\n",
"Score: 0.54\n",
"Document:\n",
"\n",
"Digestion and Absorption of Carbohydrates UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD\n",
"SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM From the Mouth\n",
"to the Stomach The mechanical and chemical digestion of carbohydrates begins in\n",
"the mouth. Chewing, also known as mastication, crumbles the carbohydrate foods\n",
"into smaller and smaller pieces. The salivary glands in the oral cavity secrete\n",
"saliva that coats the food particles. Saliva contains the enzyme, salivary\n",
"amylase. This enzyme breaks the bonds between the monomeric sugar units of\n",
"disaccharides, oligosaccharides, and starches. The salivary amylase breaks down\n",
"amylose and amylopectin into smaller chains of glucose, called dextrins and\n",
"maltose. The increased concentration of maltose in the mouth that results from\n",
"the mechanical and chemical breakdown of starches in whole grains is what\n",
"enhances their sweetness. Only about five percent of starches are broken down in\n",
"the mouth. (This is a good thing as more glucose in the mouth would lead to more\n",
"tooth decay.)When carbohydrates reach the stomach no further chemical breakdown\n",
"occurs because the amylase enzyme does not function in the acidic conditions of\n",
"the stomach.\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"result = search_queries(queries=[\"How often should infants be breastfed??\", \"what is water soluble vitamins\", \"What are symptoms of pellagra?\"],\n",
" k=1)\n",
"result"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JQkVBet9_wI3",
"outputId": "ec3a34c9-55b7-44e7-b94c-93810239e022"
},
"execution_count": 51,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Results for Query: How often should infants be breastfed??\n",
"\n",
"Score: 0.57\n",
"Document:\n",
"\n",
"milk is the best source to fulfill nutritional requirements. An exclusively\n",
"breastfed infant does not even need extra water, including in hot climates. A\n",
"newborn infant (birth to 28 days) requires feedings eight to twelve times a day\n",
"or more. Between 1 and 3 months of age, the breastfed infant becomes more\n",
"efficient, and the number of feedings per day often become fewer even though the\n",
"amount of milk consumed stays the same. After about six months, infants can\n",
"gradually begin to consume solid foods to help meet nutrient needs. Foods that\n",
"are added in addition to breastmilk are called complementary foods.\n",
"Complementary foods should be nutrient dense to provide optimal nutrition.\n",
"Complementary foods include baby meats, vegetables, fruits, infant cereal, and\n",
"dairy products such as yogurt, but not infant formula. Infant formula is a\n",
"substitute, not a complement to breastmilk. In addition to complementary foods,\n",
"the World Health Organization recommends that breastfeeding continue up to 2\n",
"years of age or beyond, and the American Academy of Pediatrics recommends at\n",
"least one year of breastfeeding, or longer.\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: what is water soluble vitamins\n",
"\n",
"Score: 0.64\n",
"Document:\n",
"\n",
"Water-Soluble Vitamins UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN\n",
"NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM All water-soluble vitamins play a\n",
"different kind of role in energy metabolism; they are required as functional\n",
"parts of enzymes involved in energy release and storage. Vitamins and minerals\n",
"that make up part of enzymes are referred to as coenzymes and cofactors,\n",
"respectively. Coenzymes and cofactors are required by enzymes to catalyze a\n",
"specific reaction. They assist in converting a substrate to an end-product.\n",
"Coenzymes and cofactors are essential in catabolic pathways and play a role in\n",
"many anabolic pathways too. In addition to being essential for metabolism, many\n",
"vitamins and minerals are required for blood renewal and function. At\n",
"insufficient levels in the diet these vitamins and minerals impair the health of\n",
"blood and consequently the delivery of nutrients in and wastes out, amongst its\n",
"many other functions. In this section we will focus on the vitamins that take\n",
"part in metabolism and blood function and renewal. Figure 9.7 Enzyme Active Site\n",
"for Cofactors 550 | Water-Soluble Vitamins\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n",
"Results for Query: What are symptoms of pellagra?\n",
"\n",
"Score: 0.59\n",
"Document:\n",
"\n",
"Niacin deficiency is commonly known as pellagra and the symptoms include\n",
"fatigue, decreased appetite, and indigestion. These symptoms are then commonly\n",
"followed by the four D’s: diarrhea, dermatitis, dementia, and sometimes death.\n",
"Figure 9.12 Conversion of Tryptophan to Niacin Water-Soluble Vitamins | 565\n",
"--------------------------------------------------\n",
"\n",
"\n",
"====================================================================================================\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"At end lets delete our Pinecone index."
],
"metadata": {
"id": "jYUgY2F1sIui"
}
},
{
"cell_type": "code",
"source": [
"pc.delete_index(index_name)"
],
"metadata": {
"id": "Ywo6nYAfV4hv"
},
"execution_count": 52,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
},
"colab": {
"provenance": [],
"toc_visible": true,
"gpuType": "T4"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"662d657066044012ba174d1a7a993aa7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_b123cfb995154c5982344f77c7d97118",
"IPY_MODEL_28e55837d06840efbe1a70b768d165b2",
"IPY_MODEL_5fe003b2e9f841b8b1f34bdcce4569bd"
],
"layout": "IPY_MODEL_2bc69c558d664d5194582deb344b3ae6"
}
},
"b123cfb995154c5982344f77c7d97118": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_474dfac0c13d49e0a603f73369dcc42e",
"placeholder": "",
"style": "IPY_MODEL_0d2acff2bdd54e8b9fddfaf685144ae7",
"value": ""
}
},
"28e55837d06840efbe1a70b768d165b2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_cbf9ece884a24a7db323757c719a5ae8",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_924c014aca3741aa808b705aa66864ed",
"value": 1
}
},
"5fe003b2e9f841b8b1f34bdcce4569bd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b72b7ebaaa0744699ba2e577a4911b59",
"placeholder": "",
"style": "IPY_MODEL_9a3b5eddeed443aba5908ad2f4e3b588",
"value": " 1208/? [00:05<00:00, 148.14it/s]"
}
},
"2bc69c558d664d5194582deb344b3ae6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"474dfac0c13d49e0a603f73369dcc42e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0d2acff2bdd54e8b9fddfaf685144ae7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"cbf9ece884a24a7db323757c719a5ae8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "20px"
}
},
"924c014aca3741aa808b705aa66864ed": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"b72b7ebaaa0744699ba2e577a4911b59": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9a3b5eddeed443aba5908ad2f4e3b588": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"94cc52bff24e4c158e2859195a142498": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_a069ef0cde334591a5dc9cc3f2a5aacf",
"IPY_MODEL_02e443fff37f4cceab1996ea4893dd3c",
"IPY_MODEL_d76e01440712420eb7249bf8e6fc1aaa"
],
"layout": "IPY_MODEL_f5809653a4a94e8f80452f1703dc47a5"
}
},
"a069ef0cde334591a5dc9cc3f2a5aacf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6792e77105e54799bdd471eec04c7c5d",
"placeholder": "",
"style": "IPY_MODEL_a84fdc3489ad4e78892906e5330e21f2",
"value": "100%"
}
},
"02e443fff37f4cceab1996ea4893dd3c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_80941748295241d2b712292647b3c623",
"max": 1208,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_91bc9829243545008533e7a21bb44d8f",
"value": 1208
}
},
"d76e01440712420eb7249bf8e6fc1aaa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a2adca3f8a484ab48b859922f93f380d",
"placeholder": "",
"style": "IPY_MODEL_86b0843d4fc4438fb312d40ad9447dcc",
"value": " 1208/1208 [00:03<00:00, 441.08it/s]"
}
},
"f5809653a4a94e8f80452f1703dc47a5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6792e77105e54799bdd471eec04c7c5d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a84fdc3489ad4e78892906e5330e21f2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"80941748295241d2b712292647b3c623": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"91bc9829243545008533e7a21bb44d8f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"a2adca3f8a484ab48b859922f93f380d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"86b0843d4fc4438fb312d40ad9447dcc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"1016a3c98b6448b7a11852633587ab3f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_74b707712fce45cd98a16b71908916db",
"IPY_MODEL_494fc75e07d14ca794c9b152e100953e",
"IPY_MODEL_20c69b545a6d4bbb86344abfea24da58"
],
"layout": "IPY_MODEL_8ffe48cbd52245ca849696746b9dec74"
}
},
"74b707712fce45cd98a16b71908916db": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_711f9bc8e4fb4e5c90aced0384b4cb54",
"placeholder": "",
"style": "IPY_MODEL_67e2aafea7754c23906a8b35e9bd2f08",
"value": "100%"
}
},
"494fc75e07d14ca794c9b152e100953e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2e86c19b91f2419e9eb181e105dfb3b0",
"max": 1208,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_5a03c35e76ca47c5a973023980862c76",
"value": 1208
}
},
"20c69b545a6d4bbb86344abfea24da58": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_58766a63fc7f4188b41ac32b386769f4",
"placeholder": "",
"style": "IPY_MODEL_1ce1ca10b5b0434aa17417b63db75ab4",
"value": " 1208/1208 [00:00<00:00, 44330.97it/s]"
}
},
"8ffe48cbd52245ca849696746b9dec74": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"711f9bc8e4fb4e5c90aced0384b4cb54": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"67e2aafea7754c23906a8b35e9bd2f08": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"2e86c19b91f2419e9eb181e105dfb3b0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5a03c35e76ca47c5a973023980862c76": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"58766a63fc7f4188b41ac32b386769f4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1ce1ca10b5b0434aa17417b63db75ab4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"b19f39b1fd314d71885b15154debce11": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9f9948c49b5e4a5e87631d4137bdca3e",
"IPY_MODEL_4eaeda1e6d94473cb39ca9ad6204b607",
"IPY_MODEL_2cf3513015d243e7842f6120f939a706"
],
"layout": "IPY_MODEL_2f2f1c2e4df346cbb7b6775e0aa0c55a"
}
},
"9f9948c49b5e4a5e87631d4137bdca3e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_ebd0595644334e02abfbafd31264196e",
"placeholder": "",
"style": "IPY_MODEL_6ac351005e7247f391c684e620744259",
"value": "100%"
}
},
"4eaeda1e6d94473cb39ca9ad6204b607": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_fe005c591c2448fb8f3f4f0bd5cc0bb1",
"max": 1208,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_980ae4efbcb64f758e1311e4e2b0cad0",
"value": 1208
}
},
"2cf3513015d243e7842f6120f939a706": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_14c9b952454c40b59325b63be4cc2d7e",
"placeholder": "",
"style": "IPY_MODEL_5a646e132031418d8cefea0a99f22444",
"value": " 1208/1208 [00:00<00:00, 12366.60it/s]"
}
},
"2f2f1c2e4df346cbb7b6775e0aa0c55a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ebd0595644334e02abfbafd31264196e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6ac351005e7247f391c684e620744259": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"fe005c591c2448fb8f3f4f0bd5cc0bb1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"980ae4efbcb64f758e1311e4e2b0cad0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"14c9b952454c40b59325b63be4cc2d7e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5a646e132031418d8cefea0a99f22444": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}