binaychandra commited on
Commit
adf2969
·
1 Parent(s): 06e5979

added project files

Browse files
Files changed (11) hide show
  1. .gitignore +160 -0
  2. .streamlit/config.toml +18 -0
  3. LICENSE +21 -0
  4. agent.ipynb +413 -0
  5. app.py +96 -0
  6. ecomm.db +0 -0
  7. fakedatagenerator.ipynb +680 -0
  8. few_shots.py +182 -0
  9. langchain_helper.py +181 -0
  10. project_prompts.py +24 -0
  11. requirements.txt +15 -0
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
.streamlit/config.toml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [theme]
2
+
3
+ # Primary accent for interactive elements
4
+ primaryColor = '#FFFF00'
5
+
6
+ # Background color for the main content area
7
+ backgroundColor = '#00172B'
8
+
9
+ # Background color for sidebar and most interactive widgets
10
+ secondaryBackgroundColor = '#000000'
11
+
12
+ # Color used for almost all text
13
+ textColor = '#FFFFFF'
14
+
15
+ # Font family for all text in the app, except code blocks
16
+ # Accepted values (serif | sans serif | monospace)
17
+ # Default: "sans serif"
18
+ font = "sans serif"
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 binaychandra
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
agent.ipynb ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from langchain.agents.agent_types import AgentType\n",
10
+ "from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent\n",
11
+ "from langchain_openai import AzureOpenAI"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "data": {
21
+ "text/plain": [
22
+ "True"
23
+ ]
24
+ },
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "output_type": "execute_result"
28
+ }
29
+ ],
30
+ "source": [
31
+ "from dotenv import load_dotenv\n",
32
+ "load_dotenv()"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 5,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "llm = AzureOpenAI(deployment_name=\"gpt-35-turbo-instruct\", temperature=0.6)"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 6,
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "data": {
51
+ "text/html": [
52
+ "<div>\n",
53
+ "<style scoped>\n",
54
+ " .dataframe tbody tr th:only-of-type {\n",
55
+ " vertical-align: middle;\n",
56
+ " }\n",
57
+ "\n",
58
+ " .dataframe tbody tr th {\n",
59
+ " vertical-align: top;\n",
60
+ " }\n",
61
+ "\n",
62
+ " .dataframe thead th {\n",
63
+ " text-align: right;\n",
64
+ " }\n",
65
+ "</style>\n",
66
+ "<table border=\"1\" class=\"dataframe\">\n",
67
+ " <thead>\n",
68
+ " <tr style=\"text-align: right;\">\n",
69
+ " <th></th>\n",
70
+ " <th>PassengerId</th>\n",
71
+ " <th>Survived</th>\n",
72
+ " <th>Pclass</th>\n",
73
+ " <th>Name</th>\n",
74
+ " <th>Sex</th>\n",
75
+ " <th>Age</th>\n",
76
+ " <th>SibSp</th>\n",
77
+ " <th>Parch</th>\n",
78
+ " <th>Ticket</th>\n",
79
+ " <th>Fare</th>\n",
80
+ " <th>Cabin</th>\n",
81
+ " <th>Embarked</th>\n",
82
+ " </tr>\n",
83
+ " </thead>\n",
84
+ " <tbody>\n",
85
+ " <tr>\n",
86
+ " <th>0</th>\n",
87
+ " <td>1</td>\n",
88
+ " <td>0</td>\n",
89
+ " <td>3</td>\n",
90
+ " <td>Braund, Mr. Owen Harris</td>\n",
91
+ " <td>male</td>\n",
92
+ " <td>22.0</td>\n",
93
+ " <td>1</td>\n",
94
+ " <td>0</td>\n",
95
+ " <td>A/5 21171</td>\n",
96
+ " <td>7.2500</td>\n",
97
+ " <td>NaN</td>\n",
98
+ " <td>S</td>\n",
99
+ " </tr>\n",
100
+ " <tr>\n",
101
+ " <th>1</th>\n",
102
+ " <td>2</td>\n",
103
+ " <td>1</td>\n",
104
+ " <td>1</td>\n",
105
+ " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
106
+ " <td>female</td>\n",
107
+ " <td>38.0</td>\n",
108
+ " <td>1</td>\n",
109
+ " <td>0</td>\n",
110
+ " <td>PC 17599</td>\n",
111
+ " <td>71.2833</td>\n",
112
+ " <td>C85</td>\n",
113
+ " <td>C</td>\n",
114
+ " </tr>\n",
115
+ " <tr>\n",
116
+ " <th>2</th>\n",
117
+ " <td>3</td>\n",
118
+ " <td>1</td>\n",
119
+ " <td>3</td>\n",
120
+ " <td>Heikkinen, Miss. Laina</td>\n",
121
+ " <td>female</td>\n",
122
+ " <td>26.0</td>\n",
123
+ " <td>0</td>\n",
124
+ " <td>0</td>\n",
125
+ " <td>STON/O2. 3101282</td>\n",
126
+ " <td>7.9250</td>\n",
127
+ " <td>NaN</td>\n",
128
+ " <td>S</td>\n",
129
+ " </tr>\n",
130
+ " <tr>\n",
131
+ " <th>3</th>\n",
132
+ " <td>4</td>\n",
133
+ " <td>1</td>\n",
134
+ " <td>1</td>\n",
135
+ " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
136
+ " <td>female</td>\n",
137
+ " <td>35.0</td>\n",
138
+ " <td>1</td>\n",
139
+ " <td>0</td>\n",
140
+ " <td>113803</td>\n",
141
+ " <td>53.1000</td>\n",
142
+ " <td>C123</td>\n",
143
+ " <td>S</td>\n",
144
+ " </tr>\n",
145
+ " <tr>\n",
146
+ " <th>4</th>\n",
147
+ " <td>5</td>\n",
148
+ " <td>0</td>\n",
149
+ " <td>3</td>\n",
150
+ " <td>Allen, Mr. William Henry</td>\n",
151
+ " <td>male</td>\n",
152
+ " <td>35.0</td>\n",
153
+ " <td>0</td>\n",
154
+ " <td>0</td>\n",
155
+ " <td>373450</td>\n",
156
+ " <td>8.0500</td>\n",
157
+ " <td>NaN</td>\n",
158
+ " <td>S</td>\n",
159
+ " </tr>\n",
160
+ " </tbody>\n",
161
+ "</table>\n",
162
+ "</div>"
163
+ ],
164
+ "text/plain": [
165
+ " PassengerId Survived Pclass \\\n",
166
+ "0 1 0 3 \n",
167
+ "1 2 1 1 \n",
168
+ "2 3 1 3 \n",
169
+ "3 4 1 1 \n",
170
+ "4 5 0 3 \n",
171
+ "\n",
172
+ " Name Sex Age SibSp \\\n",
173
+ "0 Braund, Mr. Owen Harris male 22.0 1 \n",
174
+ "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
175
+ "2 Heikkinen, Miss. Laina female 26.0 0 \n",
176
+ "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
177
+ "4 Allen, Mr. William Henry male 35.0 0 \n",
178
+ "\n",
179
+ " Parch Ticket Fare Cabin Embarked \n",
180
+ "0 0 A/5 21171 7.2500 NaN S \n",
181
+ "1 0 PC 17599 71.2833 C85 C \n",
182
+ "2 0 STON/O2. 3101282 7.9250 NaN S \n",
183
+ "3 0 113803 53.1000 C123 S \n",
184
+ "4 0 373450 8.0500 NaN S "
185
+ ]
186
+ },
187
+ "execution_count": 6,
188
+ "metadata": {},
189
+ "output_type": "execute_result"
190
+ }
191
+ ],
192
+ "source": [
193
+ "import pandas as pd\n",
194
+ "\n",
195
+ "df = pd.read_csv(\n",
196
+ " \"https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv\"\n",
197
+ ")\n",
198
+ "df.head()"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 14,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "agent = create_pandas_dataframe_agent(\n",
208
+ " llm,\n",
209
+ " df,\n",
210
+ " verbose=True,\n",
211
+ " agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
212
+ " return_intermediate_steps=True\n",
213
+ ")"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": 17,
219
+ "metadata": {},
220
+ "outputs": [
221
+ {
222
+ "name": "stdout",
223
+ "output_type": "stream",
224
+ "text": [
225
+ "\n",
226
+ "\n",
227
+ "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
228
+ "\u001b[32;1m\u001b[1;3mThought: We need to filter the dataframe for rows where the \"Sex\" column is equal to \"female\" and then count the number of rows.\n",
229
+ "Action: python_repl_ast\n",
230
+ "Action Input: df[df[\"Sex\"] == \"female\"].count()\u001b[0m\u001b[36;1m\u001b[1;3mPassengerId 314\n",
231
+ "Survived 314\n",
232
+ "Pclass 314\n",
233
+ "Name 314\n",
234
+ "Sex 314\n",
235
+ "Age 261\n",
236
+ "SibSp 314\n",
237
+ "Parch 314\n",
238
+ "Ticket 314\n",
239
+ "Fare 314\n",
240
+ "Cabin 97\n",
241
+ "Embarked 312\n",
242
+ "dtype: int64\u001b[0m\u001b[32;1m\u001b[1;3m314 is the number of females in the dataframe, but we need to specify which column we want to count.\n",
243
+ "Action: python_repl_ast\n",
244
+ "Action Input: df[df[\"Sex\"] == \"female\"][\"Sex\"].count()\u001b[0m\u001b[36;1m\u001b[1;3m314\u001b[0m\u001b[32;1m\u001b[1;3m314 is the final answer to the original input question\n",
245
+ "Final Answer: There are 314 females in the dataframe.\u001b[0m\n",
246
+ "\n",
247
+ "\u001b[1m> Finished chain.\u001b[0m\n"
248
+ ]
249
+ },
250
+ {
251
+ "data": {
252
+ "text/plain": [
253
+ "{'input': 'how many females are there ',\n",
254
+ " 'output': 'There are 314 females in the dataframe.',\n",
255
+ " 'intermediate_steps': [(AgentAction(tool='python_repl_ast', tool_input='df[df[\"Sex\"] == \"female\"].count()', log='Thought: We need to filter the dataframe for rows where the \"Sex\" column is equal to \"female\" and then count the number of rows.\\nAction: python_repl_ast\\nAction Input: df[df[\"Sex\"] == \"female\"].count()'),\n",
256
+ " PassengerId 314\n",
257
+ " Survived 314\n",
258
+ " Pclass 314\n",
259
+ " Name 314\n",
260
+ " Sex 314\n",
261
+ " Age 261\n",
262
+ " SibSp 314\n",
263
+ " Parch 314\n",
264
+ " Ticket 314\n",
265
+ " Fare 314\n",
266
+ " Cabin 97\n",
267
+ " Embarked 312\n",
268
+ " dtype: int64),\n",
269
+ " (AgentAction(tool='python_repl_ast', tool_input='df[df[\"Sex\"] == \"female\"][\"Sex\"].count()', log='314 is the number of females in the dataframe, but we need to specify which column we want to count.\\nAction: python_repl_ast\\nAction Input: df[df[\"Sex\"] == \"female\"][\"Sex\"].count()'),\n",
270
+ " 314)]}"
271
+ ]
272
+ },
273
+ "execution_count": 17,
274
+ "metadata": {},
275
+ "output_type": "execute_result"
276
+ }
277
+ ],
278
+ "source": [
279
+ "outres = agent.invoke('how many females are there ')\n",
280
+ "outres"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": 32,
286
+ "metadata": {},
287
+ "outputs": [
288
+ {
289
+ "data": {
290
+ "text/plain": [
291
+ "'df[df[\"Sex\"] == \"female\"].count()'"
292
+ ]
293
+ },
294
+ "execution_count": 32,
295
+ "metadata": {},
296
+ "output_type": "execute_result"
297
+ }
298
+ ],
299
+ "source": [
300
+ "outres['intermediate_steps'][0][0].tool_input"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 33,
306
+ "metadata": {},
307
+ "outputs": [
308
+ {
309
+ "ename": "SyntaxError",
310
+ "evalue": "invalid syntax (3216326457.py, line 1)",
311
+ "output_type": "error",
312
+ "traceback": [
313
+ "\u001b[1;36m Cell \u001b[1;32mIn[33], line 1\u001b[1;36m\u001b[0m\n\u001b[1;33m 'fig = px.line('x', 'y', param=skdfl);'\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
314
+ ]
315
+ }
316
+ ],
317
+ "source": [
318
+ "'fig = px.line('x', 'y', param=skdfl);'\n",
319
+ "fig'"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": 34,
325
+ "metadata": {},
326
+ "outputs": [],
327
+ "source": [
328
+ "import plotly.express as px\n",
329
+ "data_canada = px.data.gapminder().query(\"country == 'Canada'\")\n"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 38,
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "exec(\"x = 'abc'; y =4\")"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 40,
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "data": {
348
+ "text/plain": [
349
+ "('abc', 4)"
350
+ ]
351
+ },
352
+ "execution_count": 40,
353
+ "metadata": {},
354
+ "output_type": "execute_result"
355
+ }
356
+ ],
357
+ "source": [
358
+ "x, y"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 36,
364
+ "metadata": {},
365
+ "outputs": [
366
+ {
367
+ "ename": "ValueError",
368
+ "evalue": "Mime type rendering requires nbformat>=4.2.0 but it is not installed",
369
+ "output_type": "error",
370
+ "traceback": [
371
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
372
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
373
+ "Cell \u001b[1;32mIn[36], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
374
+ "File \u001b[1;32mc:\\Users\\PD817AE\\OneDrive - EY\\Desktop\\DataSc\\pepsico_chat\\.venv\\lib\\site-packages\\plotly\\basedatatypes.py:3410\u001b[0m, in \u001b[0;36mBaseFigure.show\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 3377\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3378\u001b[0m \u001b[38;5;124;03mShow a figure using either the default renderer(s) or the renderer(s)\u001b[39;00m\n\u001b[0;32m 3379\u001b[0m \u001b[38;5;124;03mspecified by the renderer argument\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 3406\u001b[0m \u001b[38;5;124;03mNone\u001b[39;00m\n\u001b[0;32m 3407\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3408\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mplotly\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpio\u001b[39;00m\n\u001b[1;32m-> 3410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pio\u001b[38;5;241m.\u001b[39mshow(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
375
+ "File \u001b[1;32mc:\\Users\\PD817AE\\OneDrive - EY\\Desktop\\DataSc\\pepsico_chat\\.venv\\lib\\site-packages\\plotly\\io\\_renderers.py:394\u001b[0m, in \u001b[0;36mshow\u001b[1;34m(fig, renderer, validate, **kwargs)\u001b[0m\n\u001b[0;32m 389\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 390\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMime type rendering requires ipython but it is not installed\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 391\u001b[0m )\n\u001b[0;32m 393\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m nbformat \u001b[38;5;129;01mor\u001b[39;00m Version(nbformat\u001b[38;5;241m.\u001b[39m__version__) \u001b[38;5;241m<\u001b[39m Version(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m4.2.0\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m--> 394\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 395\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMime type rendering requires nbformat>=4.2.0 but it is not installed\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 396\u001b[0m )\n\u001b[0;32m 398\u001b[0m ipython_display\u001b[38;5;241m.\u001b[39mdisplay(bundle, raw\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 400\u001b[0m \u001b[38;5;66;03m# external renderers\u001b[39;00m\n",
376
+ "\u001b[1;31mValueError\u001b[0m: Mime type rendering requires nbformat>=4.2.0 but it is not installed"
377
+ ]
378
+ }
379
+ ],
380
+ "source": [
381
+ "fig.show()"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": null,
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": []
390
+ }
391
+ ],
392
+ "metadata": {
393
+ "kernelspec": {
394
+ "display_name": ".venv",
395
+ "language": "python",
396
+ "name": "python3"
397
+ },
398
+ "language_info": {
399
+ "codemirror_mode": {
400
+ "name": "ipython",
401
+ "version": 3
402
+ },
403
+ "file_extension": ".py",
404
+ "mimetype": "text/x-python",
405
+ "name": "python",
406
+ "nbconvert_exporter": "python",
407
+ "pygments_lexer": "ipython3",
408
+ "version": "3.9.13"
409
+ }
410
+ },
411
+ "nbformat": 4,
412
+ "nbformat_minor": 2
413
+ }
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""
Streamlit chat front-end for the few-shot SQL/LLM analysis assistant.

Created on Thu Apr 25 18:00:03 2024

@author: MK529XT
"""

import streamlit as st
import string
import random
from langchain_helper import get_few_shot_db_chain
import plotly.figure_factory as ff
import numpy as np

#st.set_page_config(layout="wide")

# CSS for styling the page title banner (injected once per rerun).
st.markdown("""
<style>
.title {
    text-align: center;
    outline: solid yellow;
    font-size: 20px;
    font-family: Arial, Helvetica, sans-serif;
    color: #FFFFFF;
    padding-top: 5px;
    padding-bottom: 5px;
    #border-bottom: 2px solid #FFFF00;
    background-color: #050201;
}
</style>
""", unsafe_allow_html=True)

# Title section
st.markdown("<h1 class='title'>Manufacturing Process Analysis</h1>", unsafe_allow_html=True)

# One-time greeting bubble shown above the conversation.
with st.chat_message("assistant"):
    st.write("Hello 👋 How can I help you today?")
39
+
40
def random_string() -> dict:
    """Run the few-shot DB chain on the current chat input.

    Returns the chain's response dict. On any failure, returns a
    fallback dict of the same shape whose ``response`` field carries
    the error text, so the rendering loop never crashes.
    """
    question = st.session_state["chat_input"]
    try:
        return get_few_shot_db_chain(question)
    except Exception as err:
        # Degrade gracefully: surface the error as the bot's reply.
        return {
            "result_df": None,
            "sql_command": None,
            "response": f"LLM ran into issues : {str(err)}",
            "input": question,
            "graph_data": None,
        }
52
+
53
def chat_actions():
    """``on_submit`` callback: append the user's turn, then the bot's reply."""
    history = st.session_state["chat_history"]
    # Record the user's question exactly as typed into the chat box.
    history.append({"role": "user", "content": st.session_state["chat_input"]})
    # Record the assistant's answer (a response dict, or an error payload).
    history.append({"role": "assistant", "content": random_string()})
67
+
68
+
69
# Per-session chat history; Streamlit reruns the whole script on every
# interaction, so the list must be created exactly once.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

st.chat_input("Enter your question", on_submit=chat_actions, key="chat_input")

# Replay the whole conversation on every rerun.
# (Removed a leftover debug `print(type(i["content"]))` that spammed
# stdout on every rerun of the script.)
for i in st.session_state["chat_history"]:
    with st.chat_message(name=i["role"]):
        # User turns are stored as plain strings.
        if isinstance(i["content"], str):
            st.write(i["content"])

        # When this is llm or bot response #
        elif isinstance(i["content"], dict):
            #st.info(i["content"]["sql_command"])
            st.write(i["content"]["response"])
            result_df = i["content"]["result_df"]
            if i["content"]["graph_data"] is not None:
                # The chain produced a plotly figure: render it directly.
                st.plotly_chart(i["content"]["graph_data"], use_container_width=True)
            elif (result_df is not None) and ((result_df.shape[0] > 1) and (result_df.shape[1] > 1)):
                # Otherwise show multi-row/multi-column results as a table.
                st.plotly_chart(ff.create_table(result_df), use_container_width=True)
ecomm.db ADDED
Binary file (983 kB). View file
 
fakedatagenerator.ipynb ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 43,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import faker\n",
10
+ "import pandas as pd\n",
11
+ "import random\n",
12
+ "import sqlite3"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 18,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "fake = faker.Faker()"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 28,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "num_records = 250"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 29,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# Generate dataset\n",
40
+ "customers_data = {\n",
41
+ " \"customer_id\": range(1, num_records + 1),\n",
42
+ " \"first_name\": [fake.first_name() for _ in range(num_records)],\n",
43
+ " \"last_name\": [fake.last_name() for _ in range(num_records)],\n",
44
+ " \"email\": [fake.email() for _ in range(num_records)],\n",
45
+ " \"phone_number\": [fake.phone_number() for _ in range(num_records)],\n",
46
+ " \"address\": [fake.street_address() for _ in range(num_records)],\n",
47
+ " \"city\": [fake.city() for _ in range(num_records)],\n",
48
+ " \"state\": [fake.state() for _ in range(num_records)],\n",
49
+ " \"zip_code\": [fake.zipcode() for _ in range(num_records)],\n",
50
+ " \"country\": [fake.country() for _ in range(num_records)],\n",
51
+ " \"date_of_birth\": [fake.date_of_birth().strftime(\"%Y-%m-%d\") for _ in range(num_records)],\n",
52
+ " \"gender\": [random.choice([\"Male\", \"Female\", \"Other\"]) for _ in range(num_records)]\n",
53
+ "}\n",
54
+ "\n",
55
+ "customers_df = pd.DataFrame(customers_data)"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 31,
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "data": {
65
+ "text/html": [
66
+ "<div>\n",
67
+ "<style scoped>\n",
68
+ " .dataframe tbody tr th:only-of-type {\n",
69
+ " vertical-align: middle;\n",
70
+ " }\n",
71
+ "\n",
72
+ " .dataframe tbody tr th {\n",
73
+ " vertical-align: top;\n",
74
+ " }\n",
75
+ "\n",
76
+ " .dataframe thead th {\n",
77
+ " text-align: right;\n",
78
+ " }\n",
79
+ "</style>\n",
80
+ "<table border=\"1\" class=\"dataframe\">\n",
81
+ " <thead>\n",
82
+ " <tr style=\"text-align: right;\">\n",
83
+ " <th></th>\n",
84
+ " <th>customer_id</th>\n",
85
+ " <th>first_name</th>\n",
86
+ " <th>last_name</th>\n",
87
+ " <th>email</th>\n",
88
+ " <th>phone_number</th>\n",
89
+ " <th>address</th>\n",
90
+ " <th>city</th>\n",
91
+ " <th>state</th>\n",
92
+ " <th>zip_code</th>\n",
93
+ " <th>country</th>\n",
94
+ " <th>date_of_birth</th>\n",
95
+ " <th>gender</th>\n",
96
+ " </tr>\n",
97
+ " </thead>\n",
98
+ " <tbody>\n",
99
+ " <tr>\n",
100
+ " <th>0</th>\n",
101
+ " <td>1</td>\n",
102
+ " <td>Daniel</td>\n",
103
+ " <td>Day</td>\n",
104
+ " <td>hvalencia@example.net</td>\n",
105
+ " <td>(671)991-3668</td>\n",
106
+ " <td>2712 Matthew Course Apt. 519</td>\n",
107
+ " <td>Reginashire</td>\n",
108
+ " <td>Virginia</td>\n",
109
+ " <td>43739</td>\n",
110
+ " <td>Portugal</td>\n",
111
+ " <td>1955-02-11</td>\n",
112
+ " <td>Male</td>\n",
113
+ " </tr>\n",
114
+ " <tr>\n",
115
+ " <th>1</th>\n",
116
+ " <td>2</td>\n",
117
+ " <td>Lucas</td>\n",
118
+ " <td>Jimenez</td>\n",
119
+ " <td>jennifer95@example.org</td>\n",
120
+ " <td>694.215.1833</td>\n",
121
+ " <td>560 Victoria Shoals Apt. 465</td>\n",
122
+ " <td>Marshallmouth</td>\n",
123
+ " <td>Oklahoma</td>\n",
124
+ " <td>90653</td>\n",
125
+ " <td>Albania</td>\n",
126
+ " <td>1909-06-06</td>\n",
127
+ " <td>Female</td>\n",
128
+ " </tr>\n",
129
+ " <tr>\n",
130
+ " <th>2</th>\n",
131
+ " <td>3</td>\n",
132
+ " <td>Victoria</td>\n",
133
+ " <td>Willis</td>\n",
134
+ " <td>millersean@example.org</td>\n",
135
+ " <td>769-267-3445</td>\n",
136
+ " <td>58325 Buck Road Suite 830</td>\n",
137
+ " <td>South Pamelaborough</td>\n",
138
+ " <td>Oregon</td>\n",
139
+ " <td>73729</td>\n",
140
+ " <td>Lithuania</td>\n",
141
+ " <td>1925-09-12</td>\n",
142
+ " <td>Other</td>\n",
143
+ " </tr>\n",
144
+ " <tr>\n",
145
+ " <th>3</th>\n",
146
+ " <td>4</td>\n",
147
+ " <td>Austin</td>\n",
148
+ " <td>Carr</td>\n",
149
+ " <td>arnoldjennifer@example.com</td>\n",
150
+ " <td>874-821-2653x36986</td>\n",
151
+ " <td>01855 Peterson View Apt. 956</td>\n",
152
+ " <td>Potterton</td>\n",
153
+ " <td>Wyoming</td>\n",
154
+ " <td>80500</td>\n",
155
+ " <td>Dominica</td>\n",
156
+ " <td>1920-06-23</td>\n",
157
+ " <td>Other</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>4</th>\n",
161
+ " <td>5</td>\n",
162
+ " <td>Ethan</td>\n",
163
+ " <td>Martin</td>\n",
164
+ " <td>mark46@example.org</td>\n",
165
+ " <td>875-454-9228</td>\n",
166
+ " <td>617 Clayton Tunnel</td>\n",
167
+ " <td>Adamsport</td>\n",
168
+ " <td>Michigan</td>\n",
169
+ " <td>38936</td>\n",
170
+ " <td>Yemen</td>\n",
171
+ " <td>1985-03-13</td>\n",
172
+ " <td>Female</td>\n",
173
+ " </tr>\n",
174
+ " </tbody>\n",
175
+ "</table>\n",
176
+ "</div>"
177
+ ],
178
+ "text/plain": [
179
+ " customer_id first_name last_name email \\\n",
180
+ "0 1 Daniel Day hvalencia@example.net \n",
181
+ "1 2 Lucas Jimenez jennifer95@example.org \n",
182
+ "2 3 Victoria Willis millersean@example.org \n",
183
+ "3 4 Austin Carr arnoldjennifer@example.com \n",
184
+ "4 5 Ethan Martin mark46@example.org \n",
185
+ "\n",
186
+ " phone_number address city \\\n",
187
+ "0 (671)991-3668 2712 Matthew Course Apt. 519 Reginashire \n",
188
+ "1 694.215.1833 560 Victoria Shoals Apt. 465 Marshallmouth \n",
189
+ "2 769-267-3445 58325 Buck Road Suite 830 South Pamelaborough \n",
190
+ "3 874-821-2653x36986 01855 Peterson View Apt. 956 Potterton \n",
191
+ "4 875-454-9228 617 Clayton Tunnel Adamsport \n",
192
+ "\n",
193
+ " state zip_code country date_of_birth gender \n",
194
+ "0 Virginia 43739 Portugal 1955-02-11 Male \n",
195
+ "1 Oklahoma 90653 Albania 1909-06-06 Female \n",
196
+ "2 Oregon 73729 Lithuania 1925-09-12 Other \n",
197
+ "3 Wyoming 80500 Dominica 1920-06-23 Other \n",
198
+ "4 Michigan 38936 Yemen 1985-03-13 Female "
199
+ ]
200
+ },
201
+ "execution_count": 31,
202
+ "metadata": {},
203
+ "output_type": "execute_result"
204
+ }
205
+ ],
206
+ "source": [
207
+ "customers_df.head()"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 25,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "# Set seed for reproducibility\n",
217
+ "random.seed(42)\n",
218
+ "\n",
219
+ "# Define number of records\n",
220
+ "num_records = 1000\n",
221
+ "\n",
222
+ "# Generate dataset\n",
223
+ "items_data = {\n",
224
+ " \"id\": range(1, num_records + 1),\n",
225
+ " \"product_name\": [fake.catch_phrase() for _ in range(num_records)],\n",
226
+ " \"description\": [fake.paragraph(nb_sentences=3) for _ in range(num_records)],\n",
227
+ " \"price\": [round(random.uniform(10.0, 100.0), 2) for _ in range(num_records)],\n",
228
+ " \"category\": [random.choice([\"Electronics\", \"Fashion\", \"Home Goods\", \"Sports\", \"Toys\"]) for _ in range(num_records)],\n",
229
+ " \"sub_category\": [\n",
230
+ " random.choice([\n",
231
+ " \"Smartphones\", \"Laptops\", \"Tablets\",\n",
232
+ " \"Women's Clothing\", \"Men's Clothing\", \"Kids' Clothing\",\n",
233
+ " \"Kitchen Appliances\", \"Home Decor\", \"Furniture\",\n",
234
+ " \"Fitness Equipment\", \"Outdoor Gear\", \"Toys & Games\"\n",
235
+ " ]) for _ in range(num_records)\n",
236
+ " ],\n",
237
+ " \"brand\": [fake.company() for _ in range(num_records)],\n",
238
+ " \"rating\": [round(random.uniform(1.0, 5.0), 1) for _ in range(num_records)],\n",
239
+ " \"num_reviews\": [random.randint(1, 100) for _ in range(num_records)],\n",
240
+ " \"stock_quantity\": [random.randint(1, 100) for _ in range(num_records)],\n",
241
+ " \"seller_name\": [fake.name() for _ in range(num_records)],\n",
242
+ " \"shipping_weight\": [round(random.uniform(1.0, 10.0), 2) for _ in range(num_records)],\n",
243
+ " \"shipping_dimension\": [\n",
244
+ " f\"{random.randint(6, 20)} x {random.randint(4, 12)} x {random.randint(2, 8)}\"\n",
245
+ " for _ in range(num_records)\n",
246
+ " ]\n",
247
+ "}\n",
248
+ "\n",
249
+ "items_df = pd.DataFrame(items_data)"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": 26,
255
+ "metadata": {},
256
+ "outputs": [
257
+ {
258
+ "data": {
259
+ "text/html": [
260
+ "<div>\n",
261
+ "<style scoped>\n",
262
+ " .dataframe tbody tr th:only-of-type {\n",
263
+ " vertical-align: middle;\n",
264
+ " }\n",
265
+ "\n",
266
+ " .dataframe tbody tr th {\n",
267
+ " vertical-align: top;\n",
268
+ " }\n",
269
+ "\n",
270
+ " .dataframe thead th {\n",
271
+ " text-align: right;\n",
272
+ " }\n",
273
+ "</style>\n",
274
+ "<table border=\"1\" class=\"dataframe\">\n",
275
+ " <thead>\n",
276
+ " <tr style=\"text-align: right;\">\n",
277
+ " <th></th>\n",
278
+ " <th>id</th>\n",
279
+ " <th>product_name</th>\n",
280
+ " <th>description</th>\n",
281
+ " <th>price</th>\n",
282
+ " <th>category</th>\n",
283
+ " <th>sub_category</th>\n",
284
+ " <th>brand</th>\n",
285
+ " <th>rating</th>\n",
286
+ " <th>num_reviews</th>\n",
287
+ " <th>stock_quantity</th>\n",
288
+ " <th>seller_name</th>\n",
289
+ " <th>shipping_weight</th>\n",
290
+ " <th>shipping_dimension</th>\n",
291
+ " </tr>\n",
292
+ " </thead>\n",
293
+ " <tbody>\n",
294
+ " <tr>\n",
295
+ " <th>0</th>\n",
296
+ " <td>1</td>\n",
297
+ " <td>Ergonomic bottom-line framework</td>\n",
298
+ " <td>Kind stay kid song dream. Yourself would scene...</td>\n",
299
+ " <td>67.55</td>\n",
300
+ " <td>Electronics</td>\n",
301
+ " <td>Men's Clothing</td>\n",
302
+ " <td>Gonzalez, Jones and Hanson</td>\n",
303
+ " <td>3.2</td>\n",
304
+ " <td>52</td>\n",
305
+ " <td>9</td>\n",
306
+ " <td>Kathryn Hansen</td>\n",
307
+ " <td>2.31</td>\n",
308
+ " <td>18 x 6 x 6</td>\n",
309
+ " </tr>\n",
310
+ " <tr>\n",
311
+ " <th>1</th>\n",
312
+ " <td>2</td>\n",
313
+ " <td>Reduced high-level customer loyalty</td>\n",
314
+ " <td>Nothing free around expert decade. Great view ...</td>\n",
315
+ " <td>12.25</td>\n",
316
+ " <td>Home Goods</td>\n",
317
+ " <td>Toys &amp; Games</td>\n",
318
+ " <td>Walker-Love</td>\n",
319
+ " <td>1.7</td>\n",
320
+ " <td>52</td>\n",
321
+ " <td>34</td>\n",
322
+ " <td>Breanna Allison</td>\n",
323
+ " <td>1.40</td>\n",
324
+ " <td>14 x 7 x 2</td>\n",
325
+ " </tr>\n",
326
+ " <tr>\n",
327
+ " <th>2</th>\n",
328
+ " <td>3</td>\n",
329
+ " <td>Phased holistic capacity</td>\n",
330
+ " <td>Fire usually high manage tend available.</td>\n",
331
+ " <td>34.75</td>\n",
332
+ " <td>Toys</td>\n",
333
+ " <td>Laptops</td>\n",
334
+ " <td>Nelson-Morrison</td>\n",
335
+ " <td>2.8</td>\n",
336
+ " <td>59</td>\n",
337
+ " <td>29</td>\n",
338
+ " <td>Allen Hernandez</td>\n",
339
+ " <td>8.36</td>\n",
340
+ " <td>12 x 12 x 5</td>\n",
341
+ " </tr>\n",
342
+ " <tr>\n",
343
+ " <th>3</th>\n",
344
+ " <td>4</td>\n",
345
+ " <td>Quality-focused 6thgeneration matrix</td>\n",
346
+ " <td>Capital onto into eat unit church take ground....</td>\n",
347
+ " <td>30.09</td>\n",
348
+ " <td>Home Goods</td>\n",
349
+ " <td>Kids' Clothing</td>\n",
350
+ " <td>Sullivan, Clark and Larson</td>\n",
351
+ " <td>4.0</td>\n",
352
+ " <td>35</td>\n",
353
+ " <td>48</td>\n",
354
+ " <td>Joseph Hayden</td>\n",
355
+ " <td>2.80</td>\n",
356
+ " <td>19 x 7 x 6</td>\n",
357
+ " </tr>\n",
358
+ " <tr>\n",
359
+ " <th>4</th>\n",
360
+ " <td>5</td>\n",
361
+ " <td>Visionary systemic array</td>\n",
362
+ " <td>Woman former wind bill red authority. Police s...</td>\n",
363
+ " <td>76.28</td>\n",
364
+ " <td>Electronics</td>\n",
365
+ " <td>Home Decor</td>\n",
366
+ " <td>Evans PLC</td>\n",
367
+ " <td>4.1</td>\n",
368
+ " <td>50</td>\n",
369
+ " <td>11</td>\n",
370
+ " <td>John Mcdowell</td>\n",
371
+ " <td>4.36</td>\n",
372
+ " <td>13 x 11 x 4</td>\n",
373
+ " </tr>\n",
374
+ " </tbody>\n",
375
+ "</table>\n",
376
+ "</div>"
377
+ ],
378
+ "text/plain": [
379
+ " id product_name \\\n",
380
+ "0 1 Ergonomic bottom-line framework \n",
381
+ "1 2 Reduced high-level customer loyalty \n",
382
+ "2 3 Phased holistic capacity \n",
383
+ "3 4 Quality-focused 6thgeneration matrix \n",
384
+ "4 5 Visionary systemic array \n",
385
+ "\n",
386
+ " description price category \\\n",
387
+ "0 Kind stay kid song dream. Yourself would scene... 67.55 Electronics \n",
388
+ "1 Nothing free around expert decade. Great view ... 12.25 Home Goods \n",
389
+ "2 Fire usually high manage tend available. 34.75 Toys \n",
390
+ "3 Capital onto into eat unit church take ground.... 30.09 Home Goods \n",
391
+ "4 Woman former wind bill red authority. Police s... 76.28 Electronics \n",
392
+ "\n",
393
+ " sub_category brand rating num_reviews \\\n",
394
+ "0 Men's Clothing Gonzalez, Jones and Hanson 3.2 52 \n",
395
+ "1 Toys & Games Walker-Love 1.7 52 \n",
396
+ "2 Laptops Nelson-Morrison 2.8 59 \n",
397
+ "3 Kids' Clothing Sullivan, Clark and Larson 4.0 35 \n",
398
+ "4 Home Decor Evans PLC 4.1 50 \n",
399
+ "\n",
400
+ " stock_quantity seller_name shipping_weight shipping_dimension \n",
401
+ "0 9 Kathryn Hansen 2.31 18 x 6 x 6 \n",
402
+ "1 34 Breanna Allison 1.40 14 x 7 x 2 \n",
403
+ "2 29 Allen Hernandez 8.36 12 x 12 x 5 \n",
404
+ "3 48 Joseph Hayden 2.80 19 x 7 x 6 \n",
405
+ "4 11 John Mcdowell 4.36 13 x 11 x 4 "
406
+ ]
407
+ },
408
+ "execution_count": 26,
409
+ "metadata": {},
410
+ "output_type": "execute_result"
411
+ }
412
+ ],
413
+ "source": [
414
+ "items_df.head()"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": 32,
420
+ "metadata": {},
421
+ "outputs": [],
422
+ "source": [
423
+ "# Define number of orders\n",
424
+ "num_orders = 5000\n",
425
+ "\n",
426
+ "# Generate orders dataset\n",
427
+ "data = {\n",
428
+ " \"order_id\": range(1, num_orders + 1),\n",
429
+ " \"customer_id\": [random.choice(customers_df[\"customer_id\"]) for _ in range(num_orders)],\n",
430
+ " \"product_id\": [random.choice(items_df[\"id\"]) for _ in range(num_orders)],\n",
431
+ " \"order_date\": [fake.date_time_between(start_date=\"-2y\", end_date=\"now\").strftime(\"%Y-%m-%d %H:%M:%S\") for _ in range(num_orders)],\n",
432
+ " \"order_status\": [random.choice([\"Pending\", \"Shipped\", \"Delivered\", \"Cancelled\"]) for _ in range(num_orders)],\n",
433
+ " \"payment_method\": [random.choice([\"Credit Card\", \"PayPal\", \"Bank Transfer\"]) for _ in range(num_orders)],\n",
434
+ " \"total_amount\": [round(random.uniform(10.0, 100.0), 2) for _ in range(num_orders)],\n",
435
+ " \"shipping_address\": [fake.street_address() for _ in range(num_orders)],\n",
436
+ " \"shipping_city\": [fake.city() for _ in range(num_orders)],\n",
437
+ " \"shipping_state\": [fake.state() for _ in range(num_orders)],\n",
438
+ " \"shipping_zip\": [fake.zipcode() for _ in range(num_orders)],\n",
439
+ " \"shipping_country\": [fake.country() for _ in range(num_orders)]\n",
440
+ "}\n",
441
+ "\n",
442
+ "orders_df = pd.DataFrame(data)"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 33,
448
+ "metadata": {},
449
+ "outputs": [
450
+ {
451
+ "data": {
452
+ "text/html": [
453
+ "<div>\n",
454
+ "<style scoped>\n",
455
+ " .dataframe tbody tr th:only-of-type {\n",
456
+ " vertical-align: middle;\n",
457
+ " }\n",
458
+ "\n",
459
+ " .dataframe tbody tr th {\n",
460
+ " vertical-align: top;\n",
461
+ " }\n",
462
+ "\n",
463
+ " .dataframe thead th {\n",
464
+ " text-align: right;\n",
465
+ " }\n",
466
+ "</style>\n",
467
+ "<table border=\"1\" class=\"dataframe\">\n",
468
+ " <thead>\n",
469
+ " <tr style=\"text-align: right;\">\n",
470
+ " <th></th>\n",
471
+ " <th>order_id</th>\n",
472
+ " <th>customer_id</th>\n",
473
+ " <th>product_id</th>\n",
474
+ " <th>order_date</th>\n",
475
+ " <th>order_status</th>\n",
476
+ " <th>payment_method</th>\n",
477
+ " <th>total_amount</th>\n",
478
+ " <th>shipping_address</th>\n",
479
+ " <th>shipping_city</th>\n",
480
+ " <th>shipping_state</th>\n",
481
+ " <th>shipping_zip</th>\n",
482
+ " <th>shipping_country</th>\n",
483
+ " </tr>\n",
484
+ " </thead>\n",
485
+ " <tbody>\n",
486
+ " <tr>\n",
487
+ " <th>0</th>\n",
488
+ " <td>1</td>\n",
489
+ " <td>85</td>\n",
490
+ " <td>506</td>\n",
491
+ " <td>2024-07-03 08:05:03</td>\n",
492
+ " <td>Pending</td>\n",
493
+ " <td>Credit Card</td>\n",
494
+ " <td>54.40</td>\n",
495
+ " <td>140 Edwards Overpass</td>\n",
496
+ " <td>Kingtown</td>\n",
497
+ " <td>Kansas</td>\n",
498
+ " <td>05046</td>\n",
499
+ " <td>British Virgin Islands</td>\n",
500
+ " </tr>\n",
501
+ " <tr>\n",
502
+ " <th>1</th>\n",
503
+ " <td>2</td>\n",
504
+ " <td>88</td>\n",
505
+ " <td>270</td>\n",
506
+ " <td>2024-09-21 12:08:46</td>\n",
507
+ " <td>Shipped</td>\n",
508
+ " <td>Bank Transfer</td>\n",
509
+ " <td>54.55</td>\n",
510
+ " <td>811 Blair Glen Apt. 318</td>\n",
511
+ " <td>Port Andrew</td>\n",
512
+ " <td>New Jersey</td>\n",
513
+ " <td>46407</td>\n",
514
+ " <td>Liberia</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>2</th>\n",
518
+ " <td>3</td>\n",
519
+ " <td>63</td>\n",
520
+ " <td>89</td>\n",
521
+ " <td>2024-04-28 09:50:13</td>\n",
522
+ " <td>Shipped</td>\n",
523
+ " <td>PayPal</td>\n",
524
+ " <td>38.34</td>\n",
525
+ " <td>35571 Debra Stravenue</td>\n",
526
+ " <td>Warrenhaven</td>\n",
527
+ " <td>Louisiana</td>\n",
528
+ " <td>78358</td>\n",
529
+ " <td>Maldives</td>\n",
530
+ " </tr>\n",
531
+ " <tr>\n",
532
+ " <th>3</th>\n",
533
+ " <td>4</td>\n",
534
+ " <td>53</td>\n",
535
+ " <td>886</td>\n",
536
+ " <td>2024-03-03 22:47:52</td>\n",
537
+ " <td>Pending</td>\n",
538
+ " <td>Bank Transfer</td>\n",
539
+ " <td>46.67</td>\n",
540
+ " <td>45222 Karen Trace Apt. 530</td>\n",
541
+ " <td>Nicoleland</td>\n",
542
+ " <td>North Dakota</td>\n",
543
+ " <td>91684</td>\n",
544
+ " <td>United States Minor Outlying Islands</td>\n",
545
+ " </tr>\n",
546
+ " <tr>\n",
547
+ " <th>4</th>\n",
548
+ " <td>5</td>\n",
549
+ " <td>139</td>\n",
550
+ " <td>141</td>\n",
551
+ " <td>2024-02-06 20:16:53</td>\n",
552
+ " <td>Shipped</td>\n",
553
+ " <td>Bank Transfer</td>\n",
554
+ " <td>11.09</td>\n",
555
+ " <td>61721 Perez Walks Apt. 244</td>\n",
556
+ " <td>Lake Curtischester</td>\n",
557
+ " <td>New York</td>\n",
558
+ " <td>22193</td>\n",
559
+ " <td>Bangladesh</td>\n",
560
+ " </tr>\n",
561
+ " </tbody>\n",
562
+ "</table>\n",
563
+ "</div>"
564
+ ],
565
+ "text/plain": [
566
+ " order_id customer_id product_id order_date order_status \\\n",
567
+ "0 1 85 506 2024-07-03 08:05:03 Pending \n",
568
+ "1 2 88 270 2024-09-21 12:08:46 Shipped \n",
569
+ "2 3 63 89 2024-04-28 09:50:13 Shipped \n",
570
+ "3 4 53 886 2024-03-03 22:47:52 Pending \n",
571
+ "4 5 139 141 2024-02-06 20:16:53 Shipped \n",
572
+ "\n",
573
+ " payment_method total_amount shipping_address \\\n",
574
+ "0 Credit Card 54.40 140 Edwards Overpass \n",
575
+ "1 Bank Transfer 54.55 811 Blair Glen Apt. 318 \n",
576
+ "2 PayPal 38.34 35571 Debra Stravenue \n",
577
+ "3 Bank Transfer 46.67 45222 Karen Trace Apt. 530 \n",
578
+ "4 Bank Transfer 11.09 61721 Perez Walks Apt. 244 \n",
579
+ "\n",
580
+ " shipping_city shipping_state shipping_zip \\\n",
581
+ "0 Kingtown Kansas 05046 \n",
582
+ "1 Port Andrew New Jersey 46407 \n",
583
+ "2 Warrenhaven Louisiana 78358 \n",
584
+ "3 Nicoleland North Dakota 91684 \n",
585
+ "4 Lake Curtischester New York 22193 \n",
586
+ "\n",
587
+ " shipping_country \n",
588
+ "0 British Virgin Islands \n",
589
+ "1 Liberia \n",
590
+ "2 Maldives \n",
591
+ "3 United States Minor Outlying Islands \n",
592
+ "4 Bangladesh "
593
+ ]
594
+ },
595
+ "execution_count": 33,
596
+ "metadata": {},
597
+ "output_type": "execute_result"
598
+ }
599
+ ],
600
+ "source": [
601
+ "orders_df.head()"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "markdown",
606
+ "metadata": {},
607
+ "source": [
608
+ "Save the dataframe to SQLite"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 56,
614
+ "metadata": {},
615
+ "outputs": [],
616
+ "source": [
617
+ "# Create a connection to the SQLite database\n",
618
+ "conn = sqlite3.connect('ecomm.db')\n",
619
+ "\n",
620
+ "# Save the DataFrame to the SQLite database\n",
621
+ "customers_df.to_sql('customer_details', conn, if_exists='replace', index=False)\n",
622
+ "items_df.to_sql('items', conn, if_exists='replace', index=False)\n",
623
+ "orders_df.to_sql('orders', conn, if_exists='replace', index=False)\n",
624
+ "\n",
625
+ "# Close the connection\n",
626
+ "conn.close()"
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "markdown",
631
+ "metadata": {},
632
+ "source": [
633
+ "Deleting customers table from database"
634
+ ]
635
+ },
636
+ {
637
+ "cell_type": "code",
638
+ "execution_count": 57,
639
+ "metadata": {},
640
+ "outputs": [],
641
+ "source": [
642
+ "# Establish a connection to the database\n",
643
+ "conn = sqlite3.connect('ecomm.db')\n",
644
+ "\n",
645
+ "# Create a cursor object\n",
646
+ "cur = conn.cursor()\n",
647
+ "\n",
648
+ "# Delete the table\n",
649
+ "cur.execute('DROP TABLE customers')\n",
650
+ "\n",
651
+ "# Commit the changes\n",
652
+ "conn.commit()\n",
653
+ "\n",
654
+ "# Close the connection\n",
655
+ "conn.close()"
656
+ ]
657
+ }
658
+ ],
659
+ "metadata": {
660
+ "kernelspec": {
661
+ "display_name": ".venv",
662
+ "language": "python",
663
+ "name": "python3"
664
+ },
665
+ "language_info": {
666
+ "codemirror_mode": {
667
+ "name": "ipython",
668
+ "version": 3
669
+ },
670
+ "file_extension": ".py",
671
+ "mimetype": "text/x-python",
672
+ "name": "python",
673
+ "nbconvert_exporter": "python",
674
+ "pygments_lexer": "ipython3",
675
+ "version": "3.9.13"
676
+ }
677
+ },
678
+ "nbformat": 4,
679
+ "nbformat_minor": 2
680
+ }
few_shots.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ few_shots = [
2
+ {
3
+ 'Question': "Which customers have the highest aggregated purchase amount?",
4
+ 'SQLQuery': """
5
+ SELECT c.customer_id, c.first_name, c.last_name, SUM(o.total_amount) as total_purchase
6
+ FROM customers c
7
+ JOIN orders o ON c.customer_id = o.customer_id
8
+ GROUP BY c.customer_id
9
+ ORDER BY total_purchase DESC
10
+ LIMIT 1;
11
+ """,
12
+ 'SQLResult': "(123, 'John', 'Doe', 543.21)",
13
+ 'Answer': "John Doe with customer ID 123 has the highest aggregated purchase amount of $543.21."
14
+ },
15
+ {
16
+ 'Question': "What is the total revenue generated by all orders?",
17
+ 'SQLQuery': """
18
+ SELECT SUM(total_amount) as total_revenue
19
+ FROM orders;
20
+ """,
21
+ 'SQLResult': "(10000.00)",
22
+ 'Answer': "The total revenue generated by all orders is $10,000.00."
23
+ },
24
+ {
25
+ 'Question': "Which product has been ordered the most?",
26
+ 'SQLQuery': """
27
+ SELECT p.product_name, COUNT(o.product_id) as order_count
28
+ FROM orders o
29
+ JOIN products p ON o.product_id = p.id
30
+ GROUP BY o.product_id
31
+ ORDER BY order_count DESC
32
+ LIMIT 1;
33
+ """,
34
+ 'SQLResult': "('iPhone 13', 50)",
35
+ 'Answer': "The iPhone 13 has been ordered the most, with 50 orders."
36
+ },
37
+ {
38
+ 'Question': "What is the average order value?",
39
+ 'SQLQuery': """
40
+ SELECT AVG(total_amount) as average_order_value
41
+ FROM orders;
42
+ """,
43
+ 'SQLResult': "(50.00)",
44
+ 'Answer': "The average order value is $50.00."
45
+ },
46
+ {
47
+ 'Question': "Which customer has placed the most orders?",
48
+ 'SQLQuery': """
49
+ SELECT c.customer_id, c.first_name, c.last_name, COUNT(o.order_id) as order_count
50
+ FROM customers c
51
+ JOIN orders o ON c.customer_id = o.customer_id
52
+ GROUP BY c.customer_id
53
+ ORDER BY order_count DESC
54
+ LIMIT 1;
55
+ """,
56
+ 'SQLResult': "(123, 'John', 'Doe', 10)",
57
+ 'Answer': "John Doe with customer ID 123 has placed the most orders, with 10 orders."
58
+ },
59
+ {
60
+ 'Question': "What is the total number of unique customers?",
61
+ 'SQLQuery': """
62
+ SELECT COUNT(DISTINCT customer_id) as unique_customers
63
+ FROM orders;
64
+ """,
65
+ 'SQLResult': "(500)",
66
+ 'Answer': "There are 500 unique customers."
67
+ },
68
+ {
69
+ 'Question': "What is the most popular payment method?",
70
+ 'SQLQuery': """
71
+ SELECT payment_method, COUNT(order_id) as order_count
72
+ FROM orders
73
+ GROUP BY payment_method
74
+ ORDER BY order_count DESC
75
+ LIMIT 1;
76
+ """,
77
+ 'SQLResult': "('Credit Card', 300)",
78
+ 'Answer': "The most popular payment method is Credit Card, used in 300 orders."
79
+ },
80
+ {
81
+ 'Question': "Which product category has the highest total revenue?",
82
+ 'SQLQuery': """
83
+ SELECT p.category, SUM(o.total_amount) as total_revenue
84
+ FROM orders o
85
+ JOIN products p ON o.product_id = p.id
86
+ GROUP BY p.category
87
+ ORDER BY total_revenue DESC
88
+ LIMIT 1;
89
+ """,
90
+ 'SQLResult': "('Electronics', 5000.00)",
91
+ 'Answer': "The Electronics category has the highest total revenue of $5,000.00."
92
+ },
93
+ {
94
+ 'Question': "What is the average shipping time for orders?",
95
+ 'SQLQuery': """
96
+ SELECT AVG(DATEDIFF(delivery_date, order_date)) as average_shipping_time
97
+ FROM orders;
98
+ """,
99
+ 'SQLResult': "(3.5)",
100
+ 'Answer': "The average shipping time for orders is 3.5 days."
101
+ },
102
+ {
103
+ 'Question': "Which customer has the highest average order value?",
104
+ 'SQLQuery': """
105
+ SELECT c.customer_id, c.first_name, c.last_name, AVG(o.total_amount) as average_order_value
106
+ FROM customers c
107
+ JOIN orders o ON c.customer_id = o.customer_id
108
+ GROUP BY c.customer_id
109
+ ORDER BY average_order_value DESC
110
+ LIMIT 1;
111
+ """,
112
+ 'SQLResult': "(123, 'John', 'Doe', 100.00)",
113
+ 'Answer': "John Doe with customer ID 123 has the highest average order value of $100.00."
114
+ },
115
+ {
116
+ 'Question': "What is the total number of orders by country?",
117
+ 'SQLQuery': """
118
+ SELECT c.country, COUNT(o.order_id) as order_count
119
+ FROM customers c
120
+ JOIN orders o ON c.customer_id = o.customer_id
121
+ GROUP BY c.country;
122
+ """,
123
+ 'SQLResult': "([('USA', 200), ('Canada', 100), ('Mexico', 50)])",
124
+ 'Answer': "There are 200 orders from the USA, 100 orders from Canada, and 50 orders from Mexico."
125
+ },
126
+ {
127
+ 'Question': "Which product has the highest profit margin?",
128
+ 'SQLQuery': """
129
+ SELECT p.product_name, (p.price - p.cost) / p.price as profit_margin
130
+ FROM products p
131
+ ORDER BY profit_margin DESC
132
+ LIMIT 1;
133
+ """,
134
+ 'SQLResult': "('iPhone 13', 0.30)",
135
+ 'Answer': "The iPhone 13 has the highest profit margin of 30%."
136
+ },
137
+ {
138
+ 'Question': "What is the total revenue by month?",
139
+ 'SQLQuery': """
140
+ SELECT MONTH(o.order_date) as month, SUM(o.total_amount) as total_revenue
141
+ FROM orders o
142
+ GROUP BY MONTH(o.order_date);
143
+ """,
144
+ 'SQLResult': "([(1, 1000.00), (2, 1200.00), (3, 1500.00)])",
145
+ 'Answer': "The total revenue for January is $1,000.00, February is $1,200.00, and March is $1,500.00."
146
+ },
147
+ {
148
+ 'Question': "Which customer has placed orders in the most categories?",
149
+ 'SQLQuery': """
150
+ SELECT c.customer_id, c.first_name, c.last_name, COUNT(DISTINCT p.category) as category_count
151
+ FROM customers c
152
+ JOIN orders o ON c.customer_id = o.customer_id
153
+ JOIN products p ON o.product_id = p.id
154
+ GROUP BY c.customer_id
155
+ ORDER BY category_count DESC
156
+ LIMIT 1;
157
+ """,
158
+ 'SQLResult': "(123, 'John', 'Doe', 5)",
159
+ 'Answer': "John Doe with customer ID 123 has placed orders in 5 different categories."
160
+ },
161
+ {
162
+ 'Question': "What is the average order value by payment method?",
163
+ 'SQLQuery': """
164
+ SELECT o.payment_method, AVG(o.total_amount) as average_order_value
165
+ FROM orders o
166
+ GROUP BY o.payment_method;
167
+ """,
168
+ 'SQLResult': "([('Credit Card', 50.00), ('PayPal', 40.00), ('Bank Transfer', 60.00)])",
169
+ 'Answer': "The average order value for Credit Card is $50.00, PayPal is $40.00, and Bank Transfer is $60.00."
170
+ },
171
+ {
172
+ "Question": "how many orders were cancelled on monthly basis",
173
+ 'SQLQuery': """
174
+ SELECT strftime('%m', order_date) as month, COUNT(order_id) as cancelled_orders
175
+ FROM orders
176
+ WHERE order_status = 'Cancelled'
177
+ GROUP BY month;
178
+ """,
179
+ "SQLResult": "[('01', 108), ('02', 94), ('03', 111), ('04', 104), ('05', 108), ('06', 90), ('07', 117), ('08', 91), ('09', 102), ('10', 90), ('11', 103), ('12', 108)]",
180
+ "Answer": "There were 108 cancelled orders in January, 94 in February, 111 in March, 104 in April, 108 in May, 90 in June, 117 in July, 91 in August, 102 in September, 90 in October, 103 in November, and 108 in December."
181
+ }
182
+ ]
langchain_helper.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_openai import AzureOpenAI
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain.agents.agent_types import AgentType
5
+ from langchain_experimental.agents import create_pandas_dataframe_agent
6
+ from langchain_community.utilities import SQLDatabase
7
+ from langchain_experimental.sql import SQLDatabaseChain
8
+ from langchain.prompts import SemanticSimilarityExampleSelector
9
+ from langchain_openai import AzureOpenAIEmbeddings
10
+ from langchain_community.vectorstores import Chroma
11
+ from langchain.prompts import FewShotPromptTemplate
12
+ from langchain.prompts.prompt import PromptTemplate
13
+ from langchain.chains.sql_database.prompt import PROMPT_SUFFIX, _mysql_prompt
14
+ from sqlalchemy import create_engine
15
+ from project_prompts import sqlite_prompt
16
+ from few_shots import few_shots
17
+ import pandas as pd
18
+ import plotly
19
+ import plotly.express as px
20
+ from plotly.express import bar, line, scatter, area, pie
21
+
22
+ from dotenv import load_dotenv
23
+ load_dotenv()
24
+
25
def get_few_shot_db_chain(user_message):
    """Answer `user_message` against ecomm.db via a few-shot SQLDatabaseChain.

    Builds a semantic-similarity few-shot prompt from `few_shots`, runs the
    SQL chain, re-executes the generated SQL with pandas, and returns a dict:
    result_df, sql_command, response, input, graph_data (a plotly figure or
    None).  Returns None when the chain produced no extractable SQL command.
    """
    llm = AzureOpenAI(deployment_name="gpt-35-turbo-instruct", temperature=0.2)

    engine = create_engine("sqlite:///ecomm.db")
    db = SQLDatabase(engine=engine, sample_rows_in_table_info=3)

    embeddings = AzureOpenAIEmbeddings(model="text-embedding-3-small")

    # Concatenate every field of each example so semantic search can match
    # on question wording as well as on SQL/answer content.
    to_vectorize = [" ".join(example.values()) for example in few_shots]
    vectorstore = Chroma.from_texts(to_vectorize, embeddings, metadatas=few_shots)
    example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=2)

    example_prompt = PromptTemplate(
        input_variables=["Question", "SQLQuery", "SQLResult", "Answer"],
        template="\nQuestion: {Question}\nSQLQuery: {SQLQuery}\nSQLResult: {SQLResult}\nAnswer: {Answer}",
    )

    few_shot_prompt = FewShotPromptTemplate(
        example_selector=example_selector,
        example_prompt=example_prompt,
        prefix=sqlite_prompt,
        suffix=PROMPT_SUFFIX,
        input_variables=["input", "table_info", "top_k"],
    )

    chain = SQLDatabaseChain.from_llm(
        llm, db, verbose=True, prompt=few_shot_prompt, return_intermediate_steps=True
    )

    response_llm = chain.invoke(user_message)

    print(f"sql query : {response_llm['intermediate_steps'][1]}")
    step = response_llm['intermediate_steps'][2]
    if 'sql_cmd' not in step:
        # Bug fix: the original read intermediate_sql_query unconditionally,
        # raising NameError whenever no SQL command was extracted.
        return None
    intermediate_sql_query = step['sql_cmd']

    # Re-run the generated SQL ourselves so the caller gets a DataFrame.
    result_df = pd.read_sql_query(intermediate_sql_query, engine)

    return {
        "result_df": result_df,
        "sql_command": intermediate_sql_query,
        "response": response_llm['result'],
        "input": response_llm['query'],
        # A chart only makes sense with at least 2 rows and 2 columns.
        "graph_data": None if (result_df.shape[0] < 2 or result_df.shape[1] < 2)
        else get_graph_details(user_message, result_df),
    }
71
+
72
+
73
def get_graph_details(usermessage: str, df=None):
    """Return a plotly Figure visualising `df` for the user's question, or None.

    Strategy:
      1. Run a pandas-dataframe agent; if any intermediate step already
         produced a plotly Figure, return it directly.
      2. Otherwise ask the LLM for a single line of plotly-express code
         (``fig = px....``) and exec() it against `df`.
    """
    llm = AzureOpenAI(deployment_name="gpt-35-turbo-instruct", temperature=0.15)
    template = ChatPromptTemplate.from_messages(
        [("system", "You are a visualisation expert and plotly developer, your task is to come up with best suitable \
            chart representing user ask for the given data. please use plotly express library in python for \
            charting purposes.. and provide code for generating the figure.. there should not be any displaying \
            instructions..like fig.show() etc.."),
         ("human", "For the given dataframe below \
            ---------------------------------\
            Dataframe = {dataframe} \
            ---------------------------------\
            and user question \
            ---------------------------------\
            user_ask = {question} \
            ----------------------------------\
            Please provide the plotly chart which \
            would be best suitable to represent the user ask graphically \
            Please double check the code is not having any fig.show() or display commands"
          )]
    )

    customer_messages = template.format_messages(dataframe=df, question=usermessage)

    agent = create_pandas_dataframe_agent(
        llm,
        df,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
        return_intermediate_steps=True,
    )

    agent_response = agent.invoke(customer_messages)

    # If the agent already built a plotly figure in one of its tool steps,
    # use it directly.
    for _, step_output in agent_response['intermediate_steps']:
        if isinstance(step_output, plotly.graph_objects.Figure):
            return step_output

    # Fallback: ask for exactly one line of plotly-express code and exec it.
    # Bug fix: the original few-shot correction messages told the model to
    # start with `fig = plt.` (matplotlib) while every other instruction
    # demands `fig = px.`; the "correct" example also referenced an unrelated
    # Plant_Name dataframe.  Both are now consistent with the State example.
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a visualisation expert and plotly developer, your task is to come up with best suitable \
            chart representing user ask for the given data. please use plotly express library in python for \
            charting purposes.. and provide code for generating the figure.. there should not be any displaying \
            instructions..like fig.show() etc.."),
        ("human", "For the given dataframe below \
            ---------------------------------\
            df = State Total_GDP\
            0 Florida 7743.0\
            1 Texas 9934.0\
            2 New_York 6634.5\
            3 Denver 4456.0\
            4 Atlanta 993.5 \
            ---------------------------------\
            and user question \
            ---------------------------------\
            user_ask = What is the distribution of Total_GDP for each state? \
            ----------------------------------\
            Please provide the code using plotly express in less than 30 words which should clearly satisfy user ask\
            in terms of best representation of data. please use dataframe variable as 'df' and \
            strictly output only one line of python code start your code with initializing a figure object \n\
            like `fig = px.`"),
        ("ai", "bar(df, x='State', y='Total_GDP', title='Distribution of Total_GDP per State')"),
        ("human", "This is incorrect.. the required response should be \
            `fig = px.bar(df, x='State', y='Total_GDP', title='Distribution of Total_GDP per State')`\
            as it starts with `fig = px.` as user specified"),
        ("ai", "Sounds good, now I will remember to start with `fig = px.`"),
        ("human", "For the given dataframe below \
            ---------------------------------\
            df = {dataframe} \
            ---------------------------------\
            and user question \
            ---------------------------------\
            user_ask = {question} \
            ----------------------------------\
            Please provide the code using plotly express in less than 40 words which should clearly satisfy user ask\
            in terms of best representation of data. please use dataframe variable as 'df' and \
            strictly output only one line of python code start your code with initializing a figure object \n\
            like `fig = px.`"),
    ])
    customer_messages = template.format_messages(dataframe=df, question=usermessage)
    print(f"This is the customer message : {customer_messages}")
    code_response_llm = llm.invoke(customer_messages)
    print(f"This is the code returned by LLM : {code_response_llm}")
    try:
        print("## Executing the code line generated by llm ##")
        if "fig = " in code_response_llm:
            code_response_llm = code_response_llm.replace("AI: ", "")
            namespace = {'df': df}
            # SECURITY: exec() of LLM-generated code runs unsandboxed in this
            # process; consider a restricted evaluator before production use.
            exec(code_response_llm, globals(), namespace)
            if 'fig' in namespace:
                print("fig is there returning fig>>>>>")
                return namespace['fig']
        return None
    except Exception as e:
        print(f"Some exception occurred : {str(e)}")
        return None
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
project_prompts.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prefix prompt for the few-shot SQLDatabaseChain (see langchain_helper.py).
# It instructs the model to produce SQLite-dialect SQL over the ecomm.db
# tables injected via {table_info}, and appends output-formatting rules for
# the final natural-language answer.  {input} is the user's question; the
# Question/SQLQuery/SQLResult/Answer format must match the few-shot examples.
sqlite_prompt = """You are a SQLite expert. Given an input question, first create a syntactically correct SQLite query to run, then look at the results of the query and return the answer to the input question.
Unless the user specifies in the question a specific number of examples to obtain, query for at most 10 results using the LIMIT clause as per SQLite. You can order the results to return the most informative data in the database.
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
Pay attention to use date('now') function to get the current date, if the question involves "today".

Use the following format:

Question: Question here
SQLQuery: SQL Query to run
SQLResult: Result of the SQLQuery
Answer: Final answer here

Only use the following tables:
{table_info}

Question: {input}

If the final answer has a numerical value, convert it into words like 1234123 (One Million), only print whole number.
If the final answer has a numerical value with a decimal, print it without decimal values.
If the final answer has a numerical value and some units, print the number with units or metrics.
If the final answer has multiple decimal points reduce it into two decimal points, for example: if it is like 0.3933333333333333 then convert that into 0.39 and if it is like 161.5760959724 then convert into 161.5.
For month calculation from the existing table please use strftime formula NOT MONTH function.
"""
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.1.16
2
+ langchain-community==0.0.34
3
+ langchain-core==0.1.45
4
+ langchain-experimental==0.0.57
5
+ langchain-openai==0.1.3
6
+ numpy==1.24.4
7
+ openai==1.23.2
8
+ pandas==2.0.3
9
+ SQLAlchemy==2.0.29
10
+ streamlit==1.33.0
11
+ python-dotenv
12
+ chromadb==0.3.29
13
+ plotly
14
+ tabulate
15
+ Faker