whymath committed on
Commit
e997a79
·
1 Parent(s): 1bea032

Adding base files for RAQA prototype notebook and chainlit app

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. Dockerfile +11 -0
  3. QA_PDF_LangChain.ipynb +471 -0
  4. app.py +42 -0
  5. requirements.txt +13 -0
  6. utils.py +86 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
 
1
+ wandb/
2
+
3
  # Byte-compiled / optimized / DLL files
4
  __pycache__/
5
  *.py[cod]
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ RUN useradd -m -u 1000 user
3
+ USER user
4
+ ENV HOME=/home/user \
5
+ PATH=/home/user/.local/bin:$PATH
6
+ WORKDIR $HOME/app
7
+ COPY --chown=user . $HOME/app
8
+ COPY ./requirements.txt ~/app/requirements.txt
9
+ RUN pip install -r requirements.txt
10
+ COPY . .
11
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
QA_PDF_LangChain.ipynb ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Dependencies and Initial Setup"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Requirement already satisfied: numpy in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (1.26.4)\n",
20
+ "Requirement already satisfied: langchain in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.1.17)\n",
21
+ "Requirement already satisfied: langchain-core in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.1.48)\n",
22
+ "Requirement already satisfied: langchain-community in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.0.36)\n",
23
+ "Requirement already satisfied: langchain-openai in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.1.5)\n",
24
+ "Requirement already satisfied: qdrant-client in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (1.9.0)\n",
25
+ "Requirement already satisfied: tiktoken in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.6.0)\n",
26
+ "Requirement already satisfied: pymupdf in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (1.24.2)\n",
27
+ "Requirement already satisfied: wandb in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (0.16.6)\n",
28
+ "Requirement already satisfied: PyYAML>=5.3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (6.0.1)\n",
29
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (2.0.29)\n",
30
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (3.9.5)\n",
31
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (0.5.14)\n",
32
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (1.33)\n",
33
+ "Requirement already satisfied: langchain-text-splitters<0.1,>=0.0.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (0.0.1)\n",
34
+ "Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (0.1.52)\n",
35
+ "Requirement already satisfied: pydantic<3,>=1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (2.7.1)\n",
36
+ "Requirement already satisfied: requests<3,>=2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (2.31.0)\n",
37
+ "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain) (8.2.3)\n",
38
+ "Requirement already satisfied: packaging<24.0,>=23.2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain-core) (23.2)\n",
39
+ "Requirement already satisfied: openai<2.0.0,>=1.10.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langchain-openai) (1.25.0)\n",
40
+ "Requirement already satisfied: grpcio>=1.41.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (1.63.0)\n",
41
+ "Requirement already satisfied: grpcio-tools>=1.41.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (1.62.2)\n",
42
+ "Requirement already satisfied: httpx>=0.20.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (0.27.0)\n",
43
+ "Requirement already satisfied: portalocker<3.0.0,>=2.7.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (2.8.2)\n",
44
+ "Requirement already satisfied: urllib3<3,>=1.26.14 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from qdrant-client) (2.2.1)\n",
45
+ "Requirement already satisfied: regex>=2022.1.18 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from tiktoken) (2024.4.28)\n",
46
+ "Requirement already satisfied: PyMuPDFb==1.24.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from pymupdf) (1.24.1)\n",
47
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (8.1.7)\n",
48
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (3.1.43)\n",
49
+ "Requirement already satisfied: psutil>=5.0.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (5.9.8)\n",
50
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (2.0.1)\n",
51
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (0.4.0)\n",
52
+ "Requirement already satisfied: setproctitle in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (1.3.3)\n",
53
+ "Requirement already satisfied: setuptools in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (69.5.1)\n",
54
+ "Requirement already satisfied: appdirs>=1.4.3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (1.4.4)\n",
55
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from wandb) (4.25.3)\n",
56
+ "Requirement already satisfied: aiosignal>=1.1.2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
57
+ "Requirement already satisfied: attrs>=17.3.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
58
+ "Requirement already satisfied: frozenlist>=1.1.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
59
+ "Requirement already satisfied: multidict<7.0,>=4.5 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
60
+ "Requirement already satisfied: yarl<2.0,>=1.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
61
+ "Requirement already satisfied: colorama in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from Click!=8.0.0,>=7.1->wandb) (0.4.6)\n",
62
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.21.2)\n",
63
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
64
+ "Requirement already satisfied: six>=1.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
65
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.11)\n",
66
+ "Requirement already satisfied: anyio in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (3.7.1)\n",
67
+ "Requirement already satisfied: certifi in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (2024.2.2)\n",
68
+ "Requirement already satisfied: httpcore==1.* in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (1.0.5)\n",
69
+ "Requirement already satisfied: idna in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (3.7)\n",
70
+ "Requirement already satisfied: sniffio in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (1.3.1)\n",
71
+ "Requirement already satisfied: h11<0.15,>=0.13 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpcore==1.*->httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (0.14.0)\n",
72
+ "Requirement already satisfied: h2<5,>=3 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (4.1.0)\n",
73
+ "Requirement already satisfied: jsonpointer>=1.9 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
74
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.2)\n",
75
+ "Requirement already satisfied: distro<2,>=1.7.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from openai<2.0.0,>=1.10.0->langchain-openai) (1.9.0)\n",
76
+ "Requirement already satisfied: tqdm>4 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from openai<2.0.0,>=1.10.0->langchain-openai) (4.66.2)\n",
77
+ "Requirement already satisfied: typing-extensions<5,>=4.7 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from openai<2.0.0,>=1.10.0->langchain-openai) (4.11.0)\n",
78
+ "Requirement already satisfied: pywin32>=226 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from portalocker<3.0.0,>=2.7.0->qdrant-client) (306)\n",
79
+ "Requirement already satisfied: annotated-types>=0.4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from pydantic<3,>=1->langchain) (0.6.0)\n",
80
+ "Requirement already satisfied: pydantic-core==2.18.2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from pydantic<3,>=1->langchain) (2.18.2)\n",
81
+ "Requirement already satisfied: charset-normalizer<4,>=2 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from requests<3,>=2->langchain) (3.3.2)\n",
82
+ "Requirement already satisfied: greenlet!=0.4.17 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
83
+ "Requirement already satisfied: smmap<6,>=3.0.1 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n",
84
+ "Requirement already satisfied: hyperframe<7,>=6.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (6.0.1)\n",
85
+ "Requirement already satisfied: hpack<5,>=4.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (4.0.0)\n",
86
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in d:\\workspaces\\courses\\aimakerspace\\rag-qa-pdf\\.venv\\lib\\site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n",
87
+ "Note: you may need to restart the kernel to use updated packages.\n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "%pip install -U numpy langchain langchain-core langchain-community langchain-openai qdrant-client tiktoken pymupdf wandb"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 2,
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "name": "stderr",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mymath\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
105
+ ]
106
+ },
107
+ {
108
+ "data": {
109
+ "text/html": [
110
+ "Tracking run with wandb version 0.16.6"
111
+ ],
112
+ "text/plain": [
113
+ "<IPython.core.display.HTML object>"
114
+ ]
115
+ },
116
+ "metadata": {},
117
+ "output_type": "display_data"
118
+ },
119
+ {
120
+ "data": {
121
+ "text/html": [
122
+ "Run data is saved locally in <code>d:\\Workspaces\\Courses\\AIMakerspace\\RAG-QA-PDF\\wandb\\run-20240502_020421-r0mtht4l</code>"
123
+ ],
124
+ "text/plain": [
125
+ "<IPython.core.display.HTML object>"
126
+ ]
127
+ },
128
+ "metadata": {},
129
+ "output_type": "display_data"
130
+ },
131
+ {
132
+ "data": {
133
+ "text/html": [
134
+ "Syncing run <strong><a href='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l' target=\"_blank\">desert-dream-4</a></strong> to <a href='https://wandb.ai/ymath/QA_PDF_LangChain' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
135
+ ],
136
+ "text/plain": [
137
+ "<IPython.core.display.HTML object>"
138
+ ]
139
+ },
140
+ "metadata": {},
141
+ "output_type": "display_data"
142
+ },
143
+ {
144
+ "data": {
145
+ "text/html": [
146
+ " View project at <a href='https://wandb.ai/ymath/QA_PDF_LangChain' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain</a>"
147
+ ],
148
+ "text/plain": [
149
+ "<IPython.core.display.HTML object>"
150
+ ]
151
+ },
152
+ "metadata": {},
153
+ "output_type": "display_data"
154
+ },
155
+ {
156
+ "data": {
157
+ "text/html": [
158
+ " View run at <a href='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l</a>"
159
+ ],
160
+ "text/plain": [
161
+ "<IPython.core.display.HTML object>"
162
+ ]
163
+ },
164
+ "metadata": {},
165
+ "output_type": "display_data"
166
+ },
167
+ {
168
+ "data": {
169
+ "text/html": [
170
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
171
+ ],
172
+ "text/plain": [
173
+ "<wandb.sdk.wandb_run.Run at 0x20a437086b0>"
174
+ ]
175
+ },
176
+ "execution_count": 2,
177
+ "metadata": {},
178
+ "output_type": "execute_result"
179
+ }
180
+ ],
181
+ "source": [
182
+ "import os\n",
183
+ "import wandb\n",
184
+ "import getpass\n",
185
+ "\n",
186
+ "# UNCOMMENT TO ENTER WANDB KEY INTERACTIVELY\n",
187
+ "# wandb_key = getpass.getpass(\"Weights and Biases API Key: \")\n",
188
+ "# os.environ[\"WANDB_API_KEY\"] = wandb_key\n",
189
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"./QA_PDF_LangChain.ipynb\"\n",
190
+ "wandb.init(project=\"QA_PDF_LangChain\")"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 3,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# UNCOMMENT TO ENTER OPENAI KEY INTERACTIVELY\n",
200
+ "# os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "# Create Vector Store with Source Documents"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 4,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "import tiktoken\n",
217
+ "\n",
218
+ "def tiktoken_len(text):\n",
219
+ " tokens = tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode(\n",
220
+ " text,\n",
221
+ " )\n",
222
+ " return len(tokens)"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 5,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "name": "stdout",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "Loaded 147 documents\n",
235
+ "page_content='UNITED STATES\\nSECURITIES AND EXCHANGE COMMISSION\\nWashington, D.C.\\xa020549\\n__________________________\\nFORM 10-K\\n__________________________\\n(Mark One)\\n☒\\xa0\\xa0\\xa0\\xa0ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d)\\xa0OF THE SECURITIES EXCHANGE ACT OF 1934\\nFor the fiscal year ended December\\xa031, 2023\\nor\\n☐\\xa0\\xa0\\xa0\\xa0TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d)\\xa0OF THE SECURITIES EXCHANGE ACT OF 1934\\nFor the transition period from\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0to\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\nCommission File Number:\\xa0001-35551\\n__________________________\\nMeta Platforms, Inc.\\n(Exact name of registrant as specified in its charter)\\n__________________________\\nDelaware\\n20-1665019\\n(State or other jurisdiction of incorporation or organization)\\n(I.R.S. Employer Identification Number)\\n1 Meta Way, Menlo Park, California 94025\\n(Address of principal executive offices and Zip Code)\\n(650)\\xa0543-4800\\n(Registrant\\'s telephone number, including area code)\\n__________________________\\nSecurities registered pursuant to Section 12(b) of the Act:\\nTitle of each class\\nTrading symbol(s)\\nName of each exchange on which registered\\nClass A Common Stock, $0.000006 par value\\nMETA\\nThe Nasdaq Stock Market LLC\\nSecurities registered pursuant to Section 12(g) of the Act: None\\nIndicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☒\\xa0\\xa0No\\xa0\\xa0 ☐\\nIndicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.\\xa0\\xa0\\xa0\\xa0Yes \\xa0☐\\xa0No\\xa0 ☒\\nIndicate by check mark whether the registrant\\xa0(1)\\xa0has filed all reports required to be filed by Section\\xa013 or 15(d) of the Securities Exchange Act of 1934 (Exchange Act) during the 
preceding\\n12\\xa0months (or for such shorter period that the registrant was required to file such reports), and\\xa0(2)\\xa0has been subject to such filing requirements for the past 90\\xa0days.\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☒\\xa0\\xa0\\xa0\\xa0No\\xa0\\xa0☐\\nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§\\xa0232.405 of this chapter)\\nduring the preceding 12 months (or for such shorter period that the registrant was required to submit such files).\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☒\\xa0\\xa0\\xa0\\xa0No\\xa0\\xa0☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. See the definitions\\nof \"large accelerated filer,\" \"accelerated filer,\" \"smaller reporting company,\" and \"emerging growth company\" in Rule 12b-2 of the Exchange Act.\\nLarge accelerated filer\\n☒\\nAccelerated\\xa0filer\\n☐\\nNon-accelerated filer\\n☐\\nSmaller\\xa0reporting\\xa0company\\n☐\\nEmerging growth company\\n☐\\nIf an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or revised financial accounting standards\\nprovided pursuant to Section 13(a) of the Exchange Act. ☐\\nIndicate by check mark whether the registrant has filed a report on and attestation to its management\\'s assessment of the effectiveness of its internal control over financial reporting under Section\\n404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit report. 
☒\\nIf securities are registered pursuant to Section 12(b) of the Act, indicate by check mark whether the financial statements of the registrant included in the filing reflect the correction of an error to\\npreviously issued financial statements. ☐\\nIndicate by check mark whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the registrant’s executive officers\\nduring the relevant recovery period pursuant to §240.10D-1(b). ☐\\nIndicate by check mark whether the registrant is a shell company (as defined in Rule 12b-2 of the Exchange Act).\\xa0\\xa0\\xa0\\xa0Yes\\xa0\\xa0☐\\xa0\\xa0\\xa0\\xa0No\\xa0\\xa0 ☒\\nThe aggregate market value of the voting and non-voting stock held by non-affiliates of the registrant as of June\\xa030, 2023, the last business day of the registrant\\'s most recently completed second fiscal\\nquarter, was $637\\xa0billion based upon the closing price reported for such date on the Nasdaq Global Select Market. On January\\xa026, 2024, the registrant had 2,200,048,907 shares of Class\\xa0A common\\nstock and 349,356,199 shares of Class B common stock outstanding.\\n' metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'file_path': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'page': 0, 'total_pages': 147, 'format': 'PDF 1.4', 'title': '0001326801-24-000012', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K filed on 2024-02-02 for the period ending 2023-12-31', 'keywords': '0001326801-24-000012; ; 10-K', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': \"D:20240202060356-05'00'\", 'modDate': \"D:20240202060413-05'00'\", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4'}\n"
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "from langchain.document_loaders import PyMuPDFLoader\n",
241
+ "\n",
242
+ "# docs = PyMuPDFLoader(\"data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf\").load()\n",
243
+ "docs = PyMuPDFLoader(\"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf\").load()\n",
244
+ "\n",
245
+ "print(\"Loaded\", len(docs), \"documents\")\n",
246
+ "print(docs[0])"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": 6,
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "data": {
256
+ "text/plain": [
257
+ "663"
258
+ ]
259
+ },
260
+ "execution_count": 6,
261
+ "metadata": {},
262
+ "output_type": "execute_result"
263
+ }
264
+ ],
265
+ "source": [
266
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
267
+ "\n",
268
+ "text_splitter = RecursiveCharacterTextSplitter(\n",
269
+ " chunk_size = 200,\n",
270
+ " chunk_overlap = 0,\n",
271
+ " length_function = tiktoken_len,\n",
272
+ ")\n",
273
+ "\n",
274
+ "split_chunks = text_splitter.split_documents(docs)\n",
275
+ "\n",
276
+ "len(split_chunks)"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": 7,
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "from langchain_openai.embeddings import OpenAIEmbeddings\n",
286
+ "\n",
287
+ "embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 8,
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": [
296
+ "from langchain_community.vectorstores import Qdrant\n",
297
+ "\n",
298
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
299
+ " split_chunks,\n",
300
+ " embedding_model,\n",
301
+ " location=\":memory:\",\n",
302
+ " collection_name=\"Meta 10-k Filings\",\n",
303
+ ")\n",
304
+ "\n",
305
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "markdown",
310
+ "metadata": {},
311
+ "source": [
312
+ "# Create Chain"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 9,
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "from langchain_core.prompts import ChatPromptTemplate\n",
322
+ "\n",
323
+ "RAG_PROMPT = \"\"\"\n",
324
+ "CONTEXT:\n",
325
+ "{context}\n",
326
+ "\n",
327
+ "QUERY:\n",
328
+ "{question}\n",
329
+ "\n",
330
+ "Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with \"I don't know\".\n",
331
+ "\"\"\"\n",
332
+ "\n",
333
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": 10,
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": [
342
+ "from langchain_openai import ChatOpenAI\n",
343
+ "\n",
344
+ "openai_chat_model = ChatOpenAI(model=\"gpt-3.5-turbo\")"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 11,
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "from operator import itemgetter\n",
354
+ "# from langchain.schema.output_parser import StrOutputParser\n",
355
+ "from langchain.schema.runnable import RunnablePassthrough\n",
356
+ "\n",
357
+ "retrieval_augmented_qa_chain = (\n",
358
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
359
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
360
+ " | {\"response\": rag_prompt | openai_chat_model, \"context\": itemgetter(\"context\")}\n",
361
+ ")"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "markdown",
366
+ "metadata": {},
367
+ "source": [
368
+ "# Test Outputs"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 12,
374
+ "metadata": {},
375
+ "outputs": [
376
+ {
377
+ "data": {
378
+ "text/plain": [
379
+ "\"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $65.40 billion.\""
380
+ ]
381
+ },
382
+ "execution_count": 12,
383
+ "metadata": {},
384
+ "output_type": "execute_result"
385
+ }
386
+ ],
387
+ "source": [
388
+ "question_txt = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
389
+ "response = retrieval_augmented_qa_chain.invoke({\"question\" : question_txt})\n",
390
+ "response[\"response\"].content"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 13,
396
+ "metadata": {},
397
+ "outputs": [
398
+ {
399
+ "data": {
400
+ "text/plain": [
401
+ "'The Directors of Meta, as mentioned in the provided context, are Peggy Alford, Marc L. Andreessen, Andrew W. Houston, Nancy Killefer, Robert M. Kimmitt, Sheryl K. Sandberg, Tracey T. Travis, and Tony Xu.'"
402
+ ]
403
+ },
404
+ "execution_count": 13,
405
+ "metadata": {},
406
+ "output_type": "execute_result"
407
+ }
408
+ ],
409
+ "source": [
410
+ "question_txt = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n",
411
+ "response = retrieval_augmented_qa_chain.invoke({\"question\" : question_txt})\n",
412
+ "response[\"response\"].content"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 14,
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "data": {
422
+ "text/html": [
423
+ " View run <strong style=\"color:#cdcd00\">desert-dream-4</strong> at: <a href='https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain/runs/r0mtht4l</a><br/> View project at: <a href='https://wandb.ai/ymath/QA_PDF_LangChain' target=\"_blank\">https://wandb.ai/ymath/QA_PDF_LangChain</a><br/>Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)"
424
+ ],
425
+ "text/plain": [
426
+ "<IPython.core.display.HTML object>"
427
+ ]
428
+ },
429
+ "metadata": {},
430
+ "output_type": "display_data"
431
+ },
432
+ {
433
+ "data": {
434
+ "text/html": [
435
+ "Find logs at: <code>.\\wandb\\run-20240502_020421-r0mtht4l\\logs</code>"
436
+ ],
437
+ "text/plain": [
438
+ "<IPython.core.display.HTML object>"
439
+ ]
440
+ },
441
+ "metadata": {},
442
+ "output_type": "display_data"
443
+ }
444
+ ],
445
+ "source": [
446
+ "wandb.finish()"
447
+ ]
448
+ }
449
+ ],
450
+ "metadata": {
451
+ "kernelspec": {
452
+ "display_name": ".venv",
453
+ "language": "python",
454
+ "name": "python3"
455
+ },
456
+ "language_info": {
457
+ "codemirror_mode": {
458
+ "name": "ipython",
459
+ "version": 3
460
+ },
461
+ "file_extension": ".py",
462
+ "mimetype": "text/x-python",
463
+ "name": "python",
464
+ "nbconvert_exporter": "python",
465
+ "pygments_lexer": "ipython3",
466
+ "version": "3.12.2"
467
+ }
468
+ },
469
+ "nbformat": 4,
470
+ "nbformat_minor": 2
471
+ }
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # You can find this code for Chainlit python streaming here (https://docs.chainlit.io/concepts/streaming/python)
2
+
3
+ # OpenAI Chat completion
4
+ import os
5
+ from openai import AsyncOpenAI # importing openai for API usage
6
+ import chainlit as cl # importing chainlit for our app
7
+ from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
8
+ from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
9
+ from dotenv import load_dotenv
10
+ import utils
11
+
12
+
13
+ load_dotenv()
14
+
15
+
16
+ @cl.on_chat_start
17
+ async def start_chat():
18
+ raqa_chain = utils.create_raqa_chain_from_docs()
19
+ settings = {
20
+ "chain": raqa_chain
21
+ }
22
+ cl.user_session.set("settings", settings)
23
+
24
+
25
+ @cl.on_message
26
+ async def main(message: cl.Message):
27
+ # Print the message content
28
+ user_query = message.content
29
+ print('user_query =', user_query)
30
+
31
+ # Get the chain from the user session
32
+ settings = cl.user_session.get("settings")
33
+ raqa_chain = settings["chain"]
34
+
35
+ # Generate the response from the chain
36
+ query_response = raqa_chain.invoke({"question" : user_query})
37
+ query_answer = query_response["response"].content
38
+ print('query_answer =', query_answer)
39
+
40
+ # Create and send the message stream
41
+ msg = cl.Message(content=query_answer)
42
+ await msg.send()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ipykernel
2
+ numpy
3
+ pandas
4
+ langchain
5
+ langchain-core
6
+ langchain-community
7
+ langchain-openai
8
+ qdrant-client
9
+ tiktoken
10
+ pymupdf
11
+ wandb
12
+ chainlit
13
+ huggingface_hub
utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ from langchain.document_loaders import PyMuPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_openai.embeddings import OpenAIEmbeddings
5
+ from langchain_community.vectorstores import Qdrant
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_openai import ChatOpenAI
8
+ from operator import itemgetter
9
+ # from langchain.schema.output_parser import StrOutputParser
10
+ from langchain.schema.runnable import RunnablePassthrough
11
+
12
+
13
+ def tiktoken_len(text):
14
+ tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
15
+ text,
16
+ )
17
+ return len(tokens)
18
+
19
+
20
+ def chunk_documents(docs, tiktoken_len):
21
+ text_splitter = RecursiveCharacterTextSplitter(
22
+ chunk_size = 200,
23
+ chunk_overlap = 0,
24
+ length_function = tiktoken_len,
25
+ )
26
+
27
+ split_chunks = text_splitter.split_documents(docs)
28
+
29
+ print('len(split_chunks) =', len(split_chunks))
30
+
31
+ return split_chunks
32
+
33
+
34
+ def create_raqa_chain_from_docs():
35
+ # Load the documents from a PDF file using PyMuPDFLoader
36
+ # docs = PyMuPDFLoader("data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
37
+ docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
38
+
39
+ # Print the number of loaded documents
40
+ print("Loaded", len(docs), "documents")
41
+
42
+ # Print the first document
43
+ print(docs[0])
44
+
45
+ # Split the documents into chunks based on their length
46
+ split_chunks = chunk_documents(docs, tiktoken_len)
47
+
48
+ # Create an instance of the OpenAIEmbeddings model for text embeddings
49
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
50
+
51
+ # Create a Qdrant vector store from the split chunks
52
+ qdrant_vectorstore = Qdrant.from_documents(
53
+ split_chunks,
54
+ embedding_model,
55
+ location=":memory:",
56
+ collection_name="Meta 10-k Filings",
57
+ )
58
+
59
+ # Create a retriever from the Qdrant vector store
60
+ qdrant_retriever = qdrant_vectorstore.as_retriever()
61
+
62
+ # Define the RAG prompt template
63
+ RAG_PROMPT = """
64
+ CONTEXT:
65
+ {context}
66
+
67
+ QUERY:
68
+ {question}
69
+
70
+ Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
71
+ """
72
+
73
+ # Create a ChatPromptTemplate instance from the RAG prompt template
74
+ rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
75
+
76
+ # Create an instance of the ChatOpenAI model
77
+ openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
78
+
79
+ # Define the retrieval augmented QA chain
80
+ retrieval_augmented_qa_chain = (
81
+ {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
82
+ | RunnablePassthrough.assign(context=itemgetter("context"))
83
+ | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
84
+ )
85
+
86
+ return retrieval_augmented_qa_chain