whymath committed on
Commit
e997a79
·
1 Parent(s): 1bea032

Adding base files for RAQA prototype notebook and chainlit app

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. Dockerfile +11 -0
  3. QA_PDF_LangChain.ipynb +471 -0
  4. app.py +42 -0
  5. requirements.txt +13 -0
  6. utils.py +86 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
 
1
+ wandb/
2
+
3
  # Byte-compiled / optimized / DLL files
4
  __pycache__/
5
  *.py[cod]
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ RUN useradd -m -u 1000 user
3
+ USER user
4
+ ENV HOME=/home/user \
5
+ PATH=/home/user/.local/bin:$PATH
6
+ WORKDIR $HOME/app
7
+ COPY --chown=user . $HOME/app
8
+ COPY ./requirements.txt ~/app/requirements.txt
9
+ RUN pip install -r requirements.txt
10
+ COPY . .
11
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
QA_PDF_LangChain.ipynb ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Dependencies and Initial Setup"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Requirement already satisfied: numpy in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (1.26.4)\n",
20
+ "Requirement already satisfied: langchain in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.1.17)\n",
21
+ "Requirement already satisfied: langchain-core in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.1.48)\n",
22
+ "Requirement already satisfied: langchain-community in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.0.36)\n",
23
+ "Requirement already satisfied: langchain-openai in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.1.5)\n",
24
+ "Requirement already satisfied: qdrant-client in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (1.9.0)\n",
25
+ "Requirement already satisfied: tiktoken in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.6.0)\n",
26
+ "Requirement already satisfied: pymupdf in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (1.24.2)\n",
27
+ "Requirement already satisfied: wandb in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.16.6)\n",
28
+ "Requirement already satisfied: PyYAML>=5.3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (6.0.1)\n",
29
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (2.0.29)\n",
30
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (3.9.5)\n",
31
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (0.5.14)\n",
32
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (1.33)\n",
33
+ "Requirement already satisfied: langchain-text-splitters<0.1,>=0.0.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (0.0.1)\n",
34
+ "Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (0.1.52)\n",
35
+ "Requirement already satisfied: pydantic<3,>=1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (2.7.1)\n",
36
+ "Requirement already satisfied: requests<3,>=2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (2.31.0)\n",
37
+ "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (8.2.3)\n",
38
+ "Requirement already satisfied: packaging<24.0,>=23.2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain-core) (23.2)\n",
39
+ "Requirement already satisfied: openai<2.0.0,>=1.10.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain-openai) (1.25.0)\n",
40
+ "Requirement already satisfied: grpcio>=1.41.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (1.63.0)\n",
41
+ "Requirement already satisfied: grpcio-tools>=1.41.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (1.62.2)\n",
42
+ "Requirement already satisfied: httpx>=0.20.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (0.27.0)\n",
43
+ "Requirement already satisfied: portalocker<3.0.0,>=2.7.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (2.8.2)\n",
44
+ "Requirement already satisfied: urllib3<3,>=1.26.14 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (2.2.1)\n",
45
+ "Requirement already satisfied: regex>=2022.1.18 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from tiktoken) (2024.4.28)\n",
46
+ "Requirement already satisfied: PyMuPDFb==1.24.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from pymupdf) (1.24.1)\n",
47
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (8.1.7)\n",
48
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (3.1.43)\n",
49
+ "Requirement already satisfied: psutil>=5.0.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (5.9.8)\n",
50
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (2.0.1)\n",
51
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (0.4.0)\n",
52
+ "Requirement already satisfied: setproctitle in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (1.3.3)\n",
53
+ "Requirement already satisfied: setuptools in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (69.5.1)\n",
54
+ "Requirement already satisfied: appdirs>=1.4.3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (1.4.4)\n",
55
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (4.25.3)\n",
56
+ "Requirement already satisfied: aiosignal>=1.1.2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
57
+ "Requirement already satisfied: attrs>=17.3.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
58
+ "Requirement already satisfied: frozenlist>=1.1.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
59
+ "Requirement already satisfied: multidict<7.0,>=4.5 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
60
+ "Requirement already satisfied: yarl<2.0,>=1.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
61
+ "Requirement already satisfied: colorama in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from Click!=8.0.0,>=7.1->wandb) (0.4.6)\n",
62
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.21.2)\n",
63
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
64
+ "Requirement already satisfied: six>=1.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
65
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.11)\n",
66
+ "Requirement already satisfied: anyio in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (3.7.1)\n",
67
+ "Requirement already satisfied: certifi in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (2024.2.2)\n",
68
+ "Requirement already satisfied: httpcore==1.* in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (1.0.5)\n",
69
+ "Requirement already satisfied: idna in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (3.7)\n",
70
+ "Requirement already satisfied: sniffio in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (1.3.1)\n",
71
+ "Requirement already satisfied: h11<0.15,>=0.13 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpcore==1.*->httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (0.14.0)\n",
72
+ "Requirement already satisfied: h2<5,>=3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (4.1.0)\n",
73
+ "Requirement already satisfied: jsonpointer>=1.9 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
74
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.2)\n",
75
+ "Requirement already satisfied: distro<2,>=1.7.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from openai<2.0.0,>=1.10.0->langchain-openai) (1.9.0)\n",
76
+ "Requirement already satisfied: tqdm>4 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from openai<2.0.0,>=1.10.0->langchain-openai) (4.66.2)\n",
77
+ "Requirement already satisfied: typing-extensions<5,>=4.7 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from openai<2.0.0,>=1.10.0->langchain-openai) (4.11.0)\n",
78
+ "Requirement already satisfied: pywin32>=226 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from portalocker<3.0.0,>=2.7.0->qdrant-client) (306)\n",
79
+ "Requirement already satisfied: annotated-types>=0.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from pydantic<3,>=1->langchain) (0.6.0)\n",
80
+ "Requirement already satisfied: pydantic-core==2.18.2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from pydantic<3,>=1->langchain) (2.18.2)\n",
81
+ "Requirement already satisfied: charset-normalizer<4,>=2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from requests<3,>=2->langchain) (3.3.2)\n",
82
+ "Requirement already satisfied: greenlet!=0.4.17 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
83
+ "Requirement already satisfied: smmap<6,>=3.0.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n",
84
+ "Requirement already satisfied: hyperframe<7,>=6.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (6.0.1)\n",
85
+ "Requirement already satisfied: hpack<5,>=4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (4.0.0)\n",
86
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n",
87
+ "Note: you may need to restart the kernel to use updated packages.\n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "%pip install -U numpy langchain langchain-core langchain-community langchain-openai qdrant-client tiktoken pymupdf wandb"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 2,
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "name": "stderr",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mymath\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
105
+ ]
106
+ },
107
+ {
108
+ "data": {
109
+ "text/html": [
110
+ "Tracking run with wandb version 0.16.6"
111
+ ],
112
+ "text/plain": [
113
+ "<IPython.core.display.HTML object>"
114
+ ]
115
+ },
116
+ "metadata": {},
117
+ "output_type": "display_data"
118
+ },
119
+ {
120
+ "data": {
121
+ "text/html": [
122
+ "Run data is saved locally in <code>d:\\Workspaces\\Courses\\AIMakerspace\\RAG-QA-PDF\\wandb\\run-20240502_020421-r0mtht4l</code>"
123
+ ],
124
+ "text/plain": [
125
+ "<IPython.core.display.HTML object>"
126
+ ]
127
+ },
128
+ "metadata": {},
129
+ "output_type": "display_data"
130
+ },
131
+ {
132
+ "data": {
133
+ "text/html": [
134
+ "Syncing run <strong><a href='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l' target=\"_blank\">desert-dream-4</a></strong> to <a href='https://wandb.ai/ymath/QA_PDF_LangChain' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
135
+ ],
136
+ "text/plain": [
137
+ "<IPython.core.display.HTML object>"
138
+ ]
139
+ },
140
+ "metadata": {},
141
+ "output_type": "display_data"
142
+ },
143
+ {
144
+ "data": {
145
+ "text/html": [
146
+ " View project at <a href='https://wandb.ai/ymath/QA_PDF_LangChain' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain</a>"
147
+ ],
148
+ "text/plain": [
149
+ "<IPython.core.display.HTML object>"
150
+ ]
151
+ },
152
+ "metadata": {},
153
+ "output_type": "display_data"
154
+ },
155
+ {
156
+ "data": {
157
+ "text/html": [
158
+ " View run at <a href='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l</a>"
159
+ ],
160
+ "text/plain": [
161
+ "<IPython.core.display.HTML object>"
162
+ ]
163
+ },
164
+ "metadata": {},
165
+ "output_type": "display_data"
166
+ },
167
+ {
168
+ "data": {
169
+ "text/html": [
170
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
171
+ ],
172
+ "text/plain": [
173
+ "<wandb.sdk.wandb_run.Run at 0x20a437086b0>"
174
+ ]
175
+ },
176
+ "execution_count": 2,
177
+ "metadata": {},
178
+ "output_type": "execute_result"
179
+ }
180
+ ],
181
+ "source": [
182
+ "import os\n",
183
+ "import wandb\n",
184
+ "import getpass\n",
185
+ "\n",
186
+ "# UNCOMMENT TO ENTER WANDB KEY INTERACTIVELY\n",
187
+ "# wandb_key = getpass.getpass(\"Weights and Biases API Key: \")\n",
188
+ "# os.environ[\"WANDB_API_KEY\"] = wandb_key\n",
189
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"./QA_PDF_LangChain.ipynb\"\n",
190
+ "wandb.init(project=\"QA_PDF_LangChain\")"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 3,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# UNCOMMENT TO ENTER OPENAI KEY INTERACTIVELY\n",
200
+ "# os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "# Create Vector Store with Source Documents"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 4,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "import tiktoken\n",
217
+ "\n",
218
+ "def tiktoken_len(text):\n",
219
+ " tokens = tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode(\n",
220
+ " text,\n",
221
+ " )\n",
222
+ " return len(tokens)"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 5,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "name": "stdout",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "Loaded 147 documents\n",
235
+ "page_content='UNITED STATES\\nSECURITIES AND EXCHANGE COMMISSION\\nWashington, D.C.\\xa020549\\n__________________________\\nFORM 10-K\\n__________________________\\n(Mark One)\\n☒\\xa0\\xa0\\xa0\\xa0ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d)\\xa0OF THE SECURITIES EXCHANGE ACT OF 1934\\nFor the fiscal year ended December\\xa031, 2023\\nor\\n☐\\xa0\\xa0\\xa0\\xa0TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d)\\xa0OF THE SECURITIES EXCHANGE ACT OF 1934\\nFor the transition period from\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0to\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\nCommission File Number:\\xa0001-35551\\n__________________________\\nMeta Platforms, Inc.\\n(Exact name of registrant as specified in its charter)\\n__________________________\\nDelaware\\n20-1665019\\n(State or other jurisdiction of incorporation or organization)\\n(I.R.S. Employer Identification Number)\\n1 Meta Way, Menlo Park, California 94025\\n(Address of principal executive offices and Zip Code)\\n(650)\\xa0543-4800\\n(Registrant\\'s telephone number, including area code)\\n__________________________\\nSecurities registered pursuant to Section 12(b) of the Act:\\nTitle of each class\\nTrading symbol(s)\\nName of each exchange on which registered\\nClass A Common Stock, $0.000006 par value\\nMETA\\nThe Nasdaq Stock Market LLC\\nSecurities registered pursuant to Section 12(g) of the Act: None\\nIndicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☒\\xa0\\xa0No\\xa0\\xa0 ☐\\nIndicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.\\xa0\\xa0\\xa0\\xa0Yes \\xa0☐\\xa0No\\xa0 ☒\\nIndicate by check mark whether the registrant\\xa0(1)\\xa0has filed all reports required to be filed by Section\\xa013 or 15(d) of the Securities Exchange Act of 1934 (Exchange Act) during the 
preceding\\n12\\xa0months (or for such shorter period that the registrant was required to file such reports), and\\xa0(2)\\xa0has been subject to such filing requirements for the past 90\\xa0days.\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☒\\xa0\\xa0\\xa0\\xa0No\\xa0\\xa0☐\\nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§\\xa0232.405 of this chapter)\\nduring the preceding 12 months (or for such shorter period that the registrant was required to submit such files).\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☒\\xa0\\xa0\\xa0\\xa0No\\xa0\\xa0☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. See the definitions\\nof \"large accelerated filer,\" \"accelerated filer,\" \"smaller reporting company,\" and \"emerging growth company\" in Rule 12b-2 of the Exchange Act.\\nLarge accelerated filer\\n☒\\nAccelerated\\xa0filer\\n☐\\nNon-accelerated filer\\n☐\\nSmaller\\xa0reporting\\xa0company\\n☐\\nEmerging growth company\\n☐\\nIf an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or revised financial accounting standards\\nprovided pursuant to Section 13(a) of the Exchange Act. ☐\\nIndicate by check mark whether the registrant has filed a report on and attestation to its management\\'s assessment of the effectiveness of its internal control over financial reporting under Section\\n404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit report. 
☒\\nIf securities are registered pursuant to Section 12(b) of the Act, indicate by check mark whether the financial statements of the registrant included in the filing reflect the correction of an error to\\npreviously issued financial statements. ☐\\nIndicate by check mark whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the registrant’s executive officers\\nduring the relevant recovery period pursuant to §240.10D-1(b). ☐\\nIndicate by check mark whether the registrant is a shell company (as defined in Rule 12b-2 of the Exchange Act).\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☐\\xa0\\xa0\\xa0\\xa0No\\xa0\\xa0 ☒\\nThe aggregate market value of the voting and non-voting stock held by non-affiliates of the registrant as of June\\xa030, 2023, the last business day of the registrant\\'s most recently completed second fiscal\\nquarter, was $637\\xa0billion based upon the closing price reported for such date on the Nasdaq Global Select Market. On January\\xa026, 2024, the registrant had 2,200,048,907 shares of Class\\xa0A common\\nstock and 349,356,199 shares of Class B common stock outstanding.\\n' metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'file_path': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'page': 0, 'total_pages': 147, 'format': 'PDF 1.4', 'title': '0001326801-24-000012', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K filed on 2024-02-02 for the period ending 2023-12-31', 'keywords': '0001326801-24-000012; ; 10-K', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': \"D:20240202060356-05'00'\", 'modDate': \"D:20240202060413-05'00'\", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4'}\n"
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "from langchain.document_loaders import PyMuPDFLoader\n",
241
+ "\n",
242
+ "# docs = PyMuPDFLoader(\"data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf\").load()\n",
243
+ "docs = PyMuPDFLoader(\"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf\").load()\n",
244
+ "\n",
245
+ "print(\"Loaded\", len(docs), \"documents\")\n",
246
+ "print(docs[0])"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": 6,
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "data": {
256
+ "text/plain": [
257
+ "663"
258
+ ]
259
+ },
260
+ "execution_count": 6,
261
+ "metadata": {},
262
+ "output_type": "execute_result"
263
+ }
264
+ ],
265
+ "source": [
266
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
267
+ "\n",
268
+ "text_splitter = RecursiveCharacterTextSplitter(\n",
269
+ " chunk_size = 200,\n",
270
+ " chunk_overlap = 0,\n",
271
+ " length_function = tiktoken_len,\n",
272
+ ")\n",
273
+ "\n",
274
+ "split_chunks = text_splitter.split_documents(docs)\n",
275
+ "\n",
276
+ "len(split_chunks)"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": 7,
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "from langchain_openai.embeddings import OpenAIEmbeddings\n",
286
+ "\n",
287
+ "embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 8,
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": [
296
+ "from langchain_community.vectorstores import Qdrant\n",
297
+ "\n",
298
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
299
+ " split_chunks,\n",
300
+ " embedding_model,\n",
301
+ " location=\":memory:\",\n",
302
+ " collection_name=\"Meta 10-k Filings\",\n",
303
+ ")\n",
304
+ "\n",
305
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "markdown",
310
+ "metadata": {},
311
+ "source": [
312
+ "# Create Chain"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 9,
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "from langchain_core.prompts import ChatPromptTemplate\n",
322
+ "\n",
323
+ "RAG_PROMPT = \"\"\"\n",
324
+ "CONTEXT:\n",
325
+ "{context}\n",
326
+ "\n",
327
+ "QUERY:\n",
328
+ "{question}\n",
329
+ "\n",
330
+ "Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with \"I don't know\".\n",
331
+ "\"\"\"\n",
332
+ "\n",
333
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": 10,
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": [
342
+ "from langchain_openai import ChatOpenAI\n",
343
+ "\n",
344
+ "openai_chat_model = ChatOpenAI(model=\"gpt-3.5-turbo\")"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 11,
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "from operator import itemgetter\n",
354
+ "# from langchain.schema.output_parser import StrOutputParser\n",
355
+ "from langchain.schema.runnable import RunnablePassthrough\n",
356
+ "\n",
357
+ "retrieval_augmented_qa_chain = (\n",
358
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
359
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
360
+ " | {\"response\": rag_prompt | openai_chat_model, \"context\": itemgetter(\"context\")}\n",
361
+ ")"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "markdown",
366
+ "metadata": {},
367
+ "source": [
368
+ "# Test Outputs"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 12,
374
+ "metadata": {},
375
+ "outputs": [
376
+ {
377
+ "data": {
378
+ "text/plain": [
379
+ "\"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $65.40 billion.\""
380
+ ]
381
+ },
382
+ "execution_count": 12,
383
+ "metadata": {},
384
+ "output_type": "execute_result"
385
+ }
386
+ ],
387
+ "source": [
388
+ "question_txt = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
389
+ "response = retrieval_augmented_qa_chain.invoke({\"question\" : question_txt})\n",
390
+ "response[\"response\"].content"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 13,
396
+ "metadata": {},
397
+ "outputs": [
398
+ {
399
+ "data": {
400
+ "text/plain": [
401
+ "'The Directors of Meta, as mentioned in the provided context, are Peggy Alford, Marc L. Andreessen, Andrew W. Houston, Nancy Killefer, Robert M. Kimmitt, Sheryl K. Sandberg, Tracey T. Travis, and Tony Xu.'"
402
+ ]
403
+ },
404
+ "execution_count": 13,
405
+ "metadata": {},
406
+ "output_type": "execute_result"
407
+ }
408
+ ],
409
+ "source": [
410
+ "question_txt = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n",
411
+ "response = retrieval_augmented_qa_chain.invoke({\"question\" : question_txt})\n",
412
+ "response[\"response\"].content"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 14,
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "data": {
422
+ "text/html": [
423
+ " View run <strong style=\"color:#cdcd00\">desert-dream-4</strong> at: <a href='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l</a><br/> View project at: <a href='https://wandb.ai/ymath/QA_PDF_LangChain' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain</a><br/>Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)"
424
+ ],
425
+ "text/plain": [
426
+ "<IPython.core.display.HTML object>"
427
+ ]
428
+ },
429
+ "metadata": {},
430
+ "output_type": "display_data"
431
+ },
432
+ {
433
+ "data": {
434
+ "text/html": [
435
+ "Find logs at: <code>.\\wandb\\run-20240502_020421-r0mtht4l\\logs</code>"
436
+ ],
437
+ "text/plain": [
438
+ "<IPython.core.display.HTML object>"
439
+ ]
440
+ },
441
+ "metadata": {},
442
+ "output_type": "display_data"
443
+ }
444
+ ],
445
+ "source": [
446
+ "wandb.finish()"
447
+ ]
448
+ }
449
+ ],
450
+ "metadata": {
451
+ "kernelspec": {
452
+ "display_name": ".venv",
453
+ "language": "python",
454
+ "name": "python3"
455
+ },
456
+ "language_info": {
457
+ "codemirror_mode": {
458
+ "name": "ipython",
459
+ "version": 3
460
+ },
461
+ "file_extension": ".py",
462
+ "mimetype": "text/x-python",
463
+ "name": "python",
464
+ "nbconvert_exporter": "python",
465
+ "pygments_lexer": "ipython3",
466
+ "version": "3.12.2"
467
+ }
468
+ },
469
+ "nbformat": 4,
470
+ "nbformat_minor": 2
471
+ }
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # You can find this code for Chainlit python streaming here (https://docs.chainlit.io/concepts/streaming/python)
2
+
3
+ # OpenAI Chat completion
4
+ import os
5
+ from openai import AsyncOpenAI # importing openai for API usage
6
+ import chainlit as cl # importing chainlit for our app
7
+ from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
8
+ from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
9
+ from dotenv import load_dotenv
10
+ import utils
11
+
12
+
13
+ load_dotenv()
14
+
15
+
16
+ @cl.on_chat_start
17
+ async def start_chat():
18
+ raqa_chain = utils.create_raqa_chain_from_docs()
19
+ settings = {
20
+ "chain": raqa_chain
21
+ }
22
+ cl.user_session.set("settings", settings)
23
+
24
+
25
+ @cl.on_message
26
+ async def main(message: cl.Message):
27
+ # Print the message content
28
+ user_query = message.content
29
+ print('user_query =', user_query)
30
+
31
+ # Get the chain from the user session
32
+ settings = cl.user_session.get("settings")
33
+ raqa_chain = settings["chain"]
34
+
35
+ # Generate the response from the chain
36
+ query_response = raqa_chain.invoke({"question" : user_query})
37
+ query_answer = query_response["response"].content
38
+ print('query_answer =', query_answer)
39
+
40
+ # Create and send the message stream
41
+ msg = cl.Message(content=query_answer)
42
+ await msg.send()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ipykernel
2
+ numpy
3
+ pandas
4
+ langchain
5
+ langchain-core
6
+ langchain-community
7
+ langchain-openai
8
+ qdrant-client
9
+ tiktoken
10
+ pymupdf
11
+ wandb
12
+ chainlit
13
+ huggingface_hub
utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ from langchain.document_loaders import PyMuPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_openai.embeddings import OpenAIEmbeddings
5
+ from langchain_community.vectorstores import Qdrant
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_openai import ChatOpenAI
8
+ from operator import itemgetter
9
+ # from langchain.schema.output_parser import StrOutputParser
10
+ from langchain.schema.runnable import RunnablePassthrough
11
+
12
+
13
+ def tiktoken_len(text):
14
+ tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
15
+ text,
16
+ )
17
+ return len(tokens)
18
+
19
+
20
+ def chunk_documents(docs, tiktoken_len):
21
+ text_splitter = RecursiveCharacterTextSplitter(
22
+ chunk_size = 200,
23
+ chunk_overlap = 0,
24
+ length_function = tiktoken_len,
25
+ )
26
+
27
+ split_chunks = text_splitter.split_documents(docs)
28
+
29
+ print('len(split_chunks) =', len(split_chunks))
30
+
31
+ return split_chunks
32
+
33
+
34
+ def create_raqa_chain_from_docs():
35
+ # Load the documents from a PDF file using PyMuPDFLoader
36
+ # docs = PyMuPDFLoader("data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
37
+ docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
38
+
39
+ # Print the number of loaded documents
40
+ print("Loaded", len(docs), "documents")
41
+
42
+ # Print the first document
43
+ print(docs[0])
44
+
45
+ # Split the documents into chunks based on their length
46
+ split_chunks = chunk_documents(docs, tiktoken_len)
47
+
48
+ # Create an instance of the OpenAIEmbeddings model for text embeddings
49
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
50
+
51
+ # Create a Qdrant vector store from the split chunks
52
+ qdrant_vectorstore = Qdrant.from_documents(
53
+ split_chunks,
54
+ embedding_model,
55
+ location=":memory:",
56
+ collection_name="Meta 10-k Filings",
57
+ )
58
+
59
+ # Create a retriever from the Qdrant vector store
60
+ qdrant_retriever = qdrant_vectorstore.as_retriever()
61
+
62
+ # Define the RAG prompt template
63
+ RAG_PROMPT = """
64
+ CONTEXT:
65
+ {context}
66
+
67
+ QUERY:
68
+ {question}
69
+
70
+ Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
71
+ """
72
+
73
+ # Create a ChatPromptTemplate instance from the RAG prompt template
74
+ rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
75
+
76
+ # Create an instance of the ChatOpenAI model
77
+ openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
78
+
79
+ # Define the retrieval augmented QA chain
80
+ retrieval_augmented_qa_chain = (
81
+ {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
82
+ | RunnablePassthrough.assign(context=itemgetter("context"))
83
+ | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
84
+ )
85
+
86
+ return retrieval_augmented_qa_chain