Prakhar209 commited on
Commit
0670089
Β·
1 Parent(s): 06aff0c

Upload 2 files

Browse files
Files changed (2) hide show
  1. environment.yml +180 -0
  2. kg_creation.ipynb +441 -0
environment.yml ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: graph_rag
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - ca-certificates=2024.3.11=h06a4308_0
8
+ - ld_impl_linux-64=2.38=h1181459_1
9
+ - libffi=3.4.4=h6a678d5_0
10
+ - libgcc-ng=11.2.0=h1234567_1
11
+ - libgomp=11.2.0=h1234567_1
12
+ - libstdcxx-ng=11.2.0=h1234567_1
13
+ - ncurses=6.4=h6a678d5_0
14
+ - openssl=3.0.13=h7f8727e_0
15
+ - pip=23.3.1=py39h06a4308_0
16
+ - python=3.9.19=h955ad1f_0
17
+ - readline=8.2=h5eee18b_0
18
+ - setuptools=68.2.2=py39h06a4308_0
19
+ - sqlite=3.41.2=h5eee18b_0
20
+ - tk=8.6.12=h1ccaba5_0
21
+ - wheel=0.41.2=py39h06a4308_0
22
+ - xz=5.4.6=h5eee18b_0
23
+ - zlib=1.2.13=h5eee18b_0
24
+ - pip:
25
+ - aiohttp==3.9.5
26
+ - aiosignal==1.3.1
27
+ - annotated-types==0.6.0
28
+ - anyio==4.3.0
29
+ - argon2-cffi==23.1.0
30
+ - argon2-cffi-bindings==21.2.0
31
+ - arrow==1.3.0
32
+ - asttokens==2.4.1
33
+ - async-lru==2.0.4
34
+ - async-timeout==4.0.3
35
+ - attrs==23.2.0
36
+ - babel==2.14.0
37
+ - beautifulsoup4==4.12.3
38
+ - bleach==6.1.0
39
+ - certifi==2024.2.2
40
+ - cffi==1.16.0
41
+ - charset-normalizer==3.3.2
42
+ - click==8.1.7
43
+ - comm==0.2.2
44
+ - dataclasses-json==0.6.4
45
+ - debugpy==1.8.1
46
+ - decorator==5.1.1
47
+ - defusedxml==0.7.1
48
+ - deprecated==1.2.14
49
+ - dirtyjson==1.0.8
50
+ - distro==1.9.0
51
+ - exceptiongroup==1.2.1
52
+ - executing==2.0.1
53
+ - fastjsonschema==2.19.1
54
+ - fqdn==1.5.1
55
+ - frozenlist==1.4.1
56
+ - fsspec==2024.3.1
57
+ - greenlet==3.0.3
58
+ - h11==0.14.0
59
+ - httpcore==1.0.5
60
+ - httpx==0.27.0
61
+ - idna==3.7
62
+ - importlib-metadata==7.1.0
63
+ - ipykernel==6.29.4
64
+ - ipython==8.18.1
65
+ - isoduration==20.11.0
66
+ - jedi==0.19.1
67
+ - jinja2==3.1.3
68
+ - joblib==1.4.0
69
+ - json5==0.9.25
70
+ - jsonpatch==1.33
71
+ - jsonpointer==2.4
72
+ - jsonschema==4.21.1
73
+ - jsonschema-specifications==2023.12.1
74
+ - jupyter-client==8.6.1
75
+ - jupyter-core==5.7.2
76
+ - jupyter-events==0.10.0
77
+ - jupyter-lsp==2.2.5
78
+ - jupyter-server==2.14.0
79
+ - jupyter-server-terminals==0.5.3
80
+ - jupyterlab==4.1.6
81
+ - jupyterlab-pygments==0.3.0
82
+ - jupyterlab-server==2.26.0
83
+ - langchain==0.1.16
84
+ - langchain-community==0.0.34
85
+ - langchain-core==0.1.45
86
+ - langchain-openai==0.1.3
87
+ - langchain-text-splitters==0.0.1
88
+ - langsmith==0.1.49
89
+ - llama-index==0.10.30
90
+ - llama-index-agent-openai==0.2.2
91
+ - llama-index-cli==0.1.12
92
+ - llama-index-core==0.10.30
93
+ - llama-index-embeddings-openai==0.1.8
94
+ - llama-index-indices-managed-llama-cloud==0.1.5
95
+ - llama-index-legacy==0.9.48
96
+ - llama-index-llms-openai==0.1.16
97
+ - llama-index-multi-modal-llms-openai==0.1.5
98
+ - llama-index-program-openai==0.1.5
99
+ - llama-index-question-gen-openai==0.1.3
100
+ - llama-index-readers-file==0.1.19
101
+ - llama-index-readers-llama-parse==0.1.4
102
+ - llama-parse==0.4.1
103
+ - llamaindex-py-client==0.1.18
104
+ - markupsafe==2.1.5
105
+ - marshmallow==3.21.1
106
+ - matplotlib-inline==0.1.7
107
+ - mistune==3.0.2
108
+ - multidict==6.0.5
109
+ - mypy-extensions==1.0.0
110
+ - nbclient==0.10.0
111
+ - nbconvert==7.16.3
112
+ - nbformat==5.10.4
113
+ - neo4j==5.19.0
114
+ - nest-asyncio==1.6.0
115
+ - networkx==3.2.1
116
+ - nltk==3.8.1
117
+ - notebook==7.1.3
118
+ - notebook-shim==0.2.4
119
+ - numpy==1.26.4
120
+ - openai==1.23.2
121
+ - orjson==3.10.1
122
+ - overrides==7.7.0
123
+ - packaging==23.2
124
+ - pandas==2.2.2
125
+ - pandocfilters==1.5.1
126
+ - parso==0.8.4
127
+ - pexpect==4.9.0
128
+ - pillow==10.3.0
129
+ - platformdirs==4.2.0
130
+ - prometheus-client==0.20.0
131
+ - prompt-toolkit==3.0.43
132
+ - psutil==5.9.8
133
+ - ptyprocess==0.7.0
134
+ - pure-eval==0.2.2
135
+ - pycparser==2.22
136
+ - pydantic==2.7.0
137
+ - pydantic-core==2.18.1
138
+ - pygments==2.17.2
139
+ - pypdf==4.2.0
140
+ - python-dateutil==2.9.0.post0
141
+ - python-json-logger==2.0.7
142
+ - pytz==2024.1
143
+ - pyyaml==6.0.1
144
+ - pyzmq==26.0.2
145
+ - referencing==0.34.0
146
+ - regex==2024.4.16
147
+ - requests==2.31.0
148
+ - rfc3339-validator==0.1.4
149
+ - rfc3986-validator==0.1.1
150
+ - rpds-py==0.18.0
151
+ - send2trash==1.8.3
152
+ - six==1.16.0
153
+ - sniffio==1.3.1
154
+ - soupsieve==2.5
155
+ - sqlalchemy==2.0.29
156
+ - stack-data==0.6.3
157
+ - striprtf==0.0.26
158
+ - tenacity==8.2.3
159
+ - terminado==0.18.1
160
+ - tiktoken==0.6.0
161
+ - tinycss2==1.2.1
162
+ - tomli==2.0.1
163
+ - tornado==6.4
164
+ - tqdm==4.66.2
165
+ - traitlets==5.14.3
166
+ - types-python-dateutil==2.9.0.20240316
167
+ - typing-extensions==4.11.0
168
+ - typing-inspect==0.9.0
169
+ - tzdata==2024.1
170
+ - uri-template==1.3.0
171
+ - urllib3==2.2.1
172
+ - wcwidth==0.2.13
173
+ - webcolors==1.13
174
+ - webencodings==0.5.1
175
+ - websocket-client==1.7.0
176
+ - wikipedia==1.4.0
177
+ - wrapt==1.16.0
178
+ - yarl==1.9.4
179
+ - zipp==3.18.1
180
+ prefix: /local/home/pbhandari/miniconda3/envs/graph_rag
kg_creation.ipynb ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 20,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.environ['OPENAI_API_KEY'] = \"<REDACTED-OPENAI-API-KEY>\"  # SECURITY: a live key was committed here; it must be revoked/rotated. Load from env or a secrets manager instead.\n",
11
+ "\n",
12
+ "import logging\n",
13
+ "import sys\n",
14
+ "\n",
15
+ "logging.basicConfig(\n",
16
+ " stream=sys.stdout, level=logging.INFO\n",
17
+ ") # logging.DEBUG for more verbose output\n",
18
+ "\n",
19
+ "\n",
20
+ "# define LLM\n",
21
+ "from llama_index.llms.openai import OpenAI\n",
22
+ "from llama_index.core import Settings\n",
23
+ "\n",
24
+ "Settings.llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\")\n",
25
+ "Settings.chunk_size = 512"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 21,
31
+ "metadata": {},
32
+ "outputs": [
33
+ {
34
+ "name": "stdout",
35
+ "output_type": "stream",
36
+ "text": [
37
+ "Requirement already satisfied: langchain in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (0.1.16)\n",
38
+ "Requirement already satisfied: neo4j in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (5.19.0)\n",
39
+ "Requirement already satisfied: openai in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (1.23.2)\n",
40
+ "Requirement already satisfied: wikipedia in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (1.4.0)\n",
41
+ "Requirement already satisfied: tiktoken in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (0.6.0)\n",
42
+ "Requirement already satisfied: langchain_openai in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (0.1.3)\n",
43
+ "Requirement already satisfied: PyYAML>=5.3 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (6.0.1)\n",
44
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (2.0.29)\n",
45
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (3.9.5)\n",
46
+ "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (4.0.3)\n",
47
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.6.4)\n",
48
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (1.33)\n",
49
+ "Requirement already satisfied: langchain-community<0.1,>=0.0.32 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.0.34)\n",
50
+ "Requirement already satisfied: langchain-core<0.2.0,>=0.1.42 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.1.45)\n",
51
+ "Requirement already satisfied: langchain-text-splitters<0.1,>=0.0.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.0.1)\n",
52
+ "Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.1.49)\n",
53
+ "Requirement already satisfied: numpy<2,>=1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (1.26.4)\n",
54
+ "Requirement already satisfied: pydantic<3,>=1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (2.7.0)\n",
55
+ "Requirement already satisfied: requests<3,>=2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (2.31.0)\n",
56
+ "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (8.2.3)\n",
57
+ "Requirement already satisfied: pytz in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from neo4j) (2024.1)\n",
58
+ "Requirement already satisfied: anyio<5,>=3.5.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (4.3.0)\n",
59
+ "Requirement already satisfied: distro<2,>=1.7.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (1.9.0)\n",
60
+ "Requirement already satisfied: httpx<1,>=0.23.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (0.27.0)\n",
61
+ "Requirement already satisfied: sniffio in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (1.3.1)\n",
62
+ "Requirement already satisfied: tqdm>4 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (4.66.2)\n",
63
+ "Requirement already satisfied: typing-extensions<5,>=4.7 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (4.11.0)\n",
64
+ "Requirement already satisfied: beautifulsoup4 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from wikipedia) (4.12.3)\n",
65
+ "Requirement already satisfied: regex>=2022.1.18 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from tiktoken) (2024.4.16)\n",
66
+ "Requirement already satisfied: aiosignal>=1.1.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
67
+ "Requirement already satisfied: attrs>=17.3.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
68
+ "Requirement already satisfied: frozenlist>=1.1.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
69
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
70
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
71
+ "Requirement already satisfied: idna>=2.8 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai) (3.7)\n",
72
+ "Requirement already satisfied: exceptiongroup>=1.0.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai) (1.2.1)\n",
73
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.21.1)\n",
74
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
75
+ "Requirement already satisfied: certifi in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from httpx<1,>=0.23.0->openai) (2024.2.2)\n",
76
+ "Requirement already satisfied: httpcore==1.* in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from httpx<1,>=0.23.0->openai) (1.0.5)\n",
77
+ "Requirement already satisfied: h11<0.15,>=0.13 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n",
78
+ "Requirement already satisfied: jsonpointer>=1.9 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
79
+ "Requirement already satisfied: packaging<24.0,>=23.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain-core<0.2.0,>=0.1.42->langchain) (23.2)\n",
80
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.1)\n",
81
+ "Requirement already satisfied: annotated-types>=0.4.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from pydantic<3,>=1->langchain) (0.6.0)\n",
82
+ "Requirement already satisfied: pydantic-core==2.18.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from pydantic<3,>=1->langchain) (2.18.1)\n",
83
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from requests<3,>=2->langchain) (3.3.2)\n",
84
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from requests<3,>=2->langchain) (2.2.1)\n",
85
+ "Requirement already satisfied: greenlet!=0.4.17 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
86
+ "Requirement already satisfied: soupsieve>1.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from beautifulsoup4->wikipedia) (2.5)\n",
87
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "!pip install langchain neo4j openai wikipedia tiktoken langchain_openai"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 22,
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "from langchain.graphs import Neo4jGraph\n",
102
+ "\n",
103
+ "url = \"neo4j+s://2f409740.databases.neo4j.io\"\n",
104
+ "username =\"neo4j\"\n",
105
+ "password = \"<REDACTED-NEO4J-PASSWORD>\"  # SECURITY: a live database password was committed here; rotate it and load from env (e.g. os.environ['NEO4J_PASSWORD']).\n",
106
+ "graph = Neo4jGraph(\n",
107
+ " url=url,\n",
108
+ " username=username,\n",
109
+ " password=password\n",
110
+ ")"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 23,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "from langchain_community.graphs.graph_document import (\n",
120
+ " Node as BaseNode,\n",
121
+ " Relationship as BaseRelationship,\n",
122
+ " GraphDocument,\n",
123
+ ")\n",
124
+ "from langchain.schema import Document\n",
125
+ "from typing import List, Dict, Any, Optional\n",
126
+ "from langchain.pydantic_v1 import Field, BaseModel\n",
127
+ "\n",
128
+ "class Property(BaseModel):\n",
129
+ " \"\"\"A single property consisting of key and value\"\"\"\n",
130
+ " key: str = Field(..., description=\"key\")\n",
131
+ " value: str = Field(..., description=\"value\")\n",
132
+ "\n",
133
+ "class Node(BaseNode):\n",
134
+ " properties: Optional[List[Property]] = Field(\n",
135
+ " None, description=\"List of node properties\")\n",
136
+ "\n",
137
+ "class Relationship(BaseRelationship):\n",
138
+ " properties: Optional[List[Property]] = Field(\n",
139
+ " None, description=\"List of relationship properties\"\n",
140
+ " )\n",
141
+ "\n",
142
+ "class KnowledgeGraph(BaseModel):\n",
143
+ " \"\"\"Generate a knowledge graph with entities and relationships.\"\"\"\n",
144
+ " nodes: List[Node] = Field(\n",
145
+ " ..., description=\"List of nodes in the knowledge graph\")\n",
146
+ " rels: List[Relationship] = Field(\n",
147
+ " ..., description=\"List of relationships in the knowledge graph\"\n",
148
+ " )"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 24,
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "def format_property_key(s: str) -> str:\n",
158
+ " words = s.split()\n",
159
+ " if not words:\n",
160
+ " return s\n",
161
+ " first_word = words[0].lower()\n",
162
+ " capitalized_words = [word.capitalize() for word in words[1:]]\n",
163
+ " return \"\".join([first_word] + capitalized_words)\n",
164
+ "\n",
165
+ "def props_to_dict(props) -> dict:\n",
166
+ " \"\"\"Convert properties to a dictionary.\"\"\"\n",
167
+ " properties = {}\n",
168
+ " if not props:\n",
169
+ " return properties\n",
170
+ " for p in props:\n",
171
+ " properties[format_property_key(p.key)] = p.value\n",
172
+ " return properties\n",
173
+ "\n",
174
+ "def map_to_base_node(node: Node) -> BaseNode:\n",
175
+ " \"\"\"Map the KnowledgeGraph Node to the base Node.\"\"\"\n",
176
+ " properties = props_to_dict(node.properties) if node.properties else {}\n",
177
+ " # Add name property for better Cypher statement generation\n",
178
+ " properties[\"name\"] = node.id.title()\n",
179
+ " return BaseNode(\n",
180
+ " id=node.id.title(), type=node.type.capitalize(), properties=properties\n",
181
+ " )\n",
182
+ "\n",
183
+ "\n",
184
+ "def map_to_base_relationship(rel: Relationship) -> BaseRelationship:\n",
185
+ " \"\"\"Map the KnowledgeGraph Relationship to the base Relationship.\"\"\"\n",
186
+ " source = map_to_base_node(rel.source)\n",
187
+ " target = map_to_base_node(rel.target)\n",
188
+ " properties = props_to_dict(rel.properties) if rel.properties else {}\n",
189
+ " return BaseRelationship(\n",
190
+ " source=source, target=target, type=rel.type, properties=properties\n",
191
+ " )"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 25,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "import os\n",
201
+ "from langchain.chains.openai_functions import (\n",
202
+ " create_openai_fn_chain,\n",
203
+ " create_structured_output_chain,\n",
204
+ ")\n",
205
+ "from langchain_openai import ChatOpenAI\n",
206
+ "from langchain.prompts import ChatPromptTemplate\n",
207
+ "\n",
208
+ "os.environ[\"OPENAI_API_KEY\"] = \"<REDACTED-OPENAI-API-KEY>\"  # SECURITY: same leaked key as the first cell; redundant assignment — set once from a secure source.\n",
209
+ "llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\", temperature=0)\n",
210
+ "\n",
211
+ "def get_extraction_chain(\n",
212
+ " allowed_nodes: Optional[List[str]] = None,\n",
213
+ " allowed_rels: Optional[List[str]] = None\n",
214
+ " ):\n",
215
+ " prompt = ChatPromptTemplate.from_messages(\n",
216
+ " [(\n",
217
+ " \"system\",\n",
218
+ " f\"\"\"# Knowledge Graph Instructions for GPT-4\n",
219
+ "## 1. Overview\n",
220
+ "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n",
221
+ "- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.\n",
222
+ "- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.\n",
223
+ "## 2. Labeling Nodes\n",
224
+ "- **Consistency**: Ensure you use basic or elementary types for node labels.\n",
225
+ " - For example, when you identify an entity representing a person, always label it as **\"person\"**. Avoid using more specific terms like \"mathematician\" or \"scientist\".\n",
226
+ "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.\n",
227
+ "{'- **Allowed Node Labels:**' + \", \".join(allowed_nodes) if allowed_nodes else \"\"}\n",
228
+ "{'- **Allowed Relationship Types**:' + \", \".join(allowed_rels) if allowed_rels else \"\"}\n",
229
+ "## 3. Handling Numerical Data and Dates\n",
230
+ "- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.\n",
231
+ "- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.\n",
232
+ "- **Property Format**: Properties must be in a key-value format.\n",
233
+ "- **Quotation Marks**: Never use escaped single or double quotes within property values.\n",
234
+ "- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.\n",
235
+ "## 4. Coreference Resolution\n",
236
+ "- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n",
237
+ "If an entity, such as \"John Doe\", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., \"Joe\", \"he\"),\n",
238
+ "always use the most complete identifier for that entity throughout the knowledge graph. In this example, use \"John Doe\" as the entity ID.\n",
239
+ "Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n",
240
+ "## 5. Strict Compliance\n",
241
+ "Adhere to the rules strictly. Non-compliance will result in termination.\n",
242
+ " \"\"\"),\n",
243
+ " (\"human\", \"Use the given format to extract information from the following input: {input}\"),\n",
244
+ " (\"human\", \"Tip: Make sure to answer in the correct format\"),\n",
245
+ " ])\n",
246
+ " return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": 26,
252
+ "metadata": {},
253
+ "outputs": [],
254
+ "source": [
255
+ "def extract_and_store_graph(\n",
256
+ " document: Document,\n",
257
+ " nodes:Optional[List[str]] = None,\n",
258
+ " rels:Optional[List[str]]=None) -> None:\n",
259
+ " # Extract graph data using OpenAI functions\n",
260
+ " extract_chain = get_extraction_chain(nodes, rels)\n",
261
+ " data = extract_chain.invoke(document.page_content)['function']\n",
262
+ " # Construct a graph document\n",
263
+ " graph_document = GraphDocument(\n",
264
+ " nodes = [map_to_base_node(node) for node in data.nodes],\n",
265
+ " relationships = [map_to_base_relationship(rel) for rel in data.rels],\n",
266
+ " source = document\n",
267
+ " )\n",
268
+ " # Store information into a graph\n",
269
+ " graph.add_graph_documents([graph_document])"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 27,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "from langchain.document_loaders import WikipediaLoader\n",
279
+ "from langchain.text_splitter import TokenTextSplitter\n",
280
+ "\n",
281
+ "# Read the wikipedia article\n",
282
+ "raw_documents = WikipediaLoader(query=\"Chemotherapy\").load()\n",
283
+ "# Define chunking strategy\n",
284
+ "text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)\n",
285
+ "\n",
286
+ "# Only take the first 3 raw_documents\n",
287
+ "documents = text_splitter.split_documents(raw_documents[:3])"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 28,
293
+ "metadata": {},
294
+ "outputs": [
295
+ {
296
+ "name": "stderr",
297
+ "output_type": "stream",
298
+ "text": [
299
+ " 0%| | 0/3 [00:00<?, ?it/s]"
300
+ ]
301
+ },
302
+ {
303
+ "name": "stdout",
304
+ "output_type": "stream",
305
+ "text": [
306
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 429 Too Many Requests\"\n",
307
+ "INFO:openai._base_client:Retrying request to /chat/completions in 0.903231 seconds\n",
308
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
309
+ ]
310
+ },
311
+ {
312
+ "name": "stderr",
313
+ "output_type": "stream",
314
+ "text": [
315
+ " 33%|β–ˆβ–ˆβ–ˆβ–Ž | 1/3 [00:44<01:29, 44.80s/it]"
316
+ ]
317
+ },
318
+ {
319
+ "name": "stdout",
320
+ "output_type": "stream",
321
+ "text": [
322
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
323
+ ]
324
+ },
325
+ {
326
+ "name": "stderr",
327
+ "output_type": "stream",
328
+ "text": [
329
+ " 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2/3 [00:59<00:27, 27.11s/it]"
330
+ ]
331
+ },
332
+ {
333
+ "name": "stdout",
334
+ "output_type": "stream",
335
+ "text": [
336
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
337
+ ]
338
+ },
339
+ {
340
+ "name": "stderr",
341
+ "output_type": "stream",
342
+ "text": [
343
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [01:07<00:00, 22.36s/it]\n"
344
+ ]
345
+ }
346
+ ],
347
+ "source": [
348
+ "from tqdm import tqdm\n",
349
+ "\n",
350
+ "for i, d in tqdm(enumerate(documents), total=len(documents)):\n",
351
+ " extract_and_store_graph(d)"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 29,
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "# Query the knowledge graph in a RAG application\n",
361
+ "from langchain.chains import GraphCypherQAChain\n",
362
+ "\n",
363
+ "graph.refresh_schema()\n",
364
+ "\n",
365
+ "cypher_chain = GraphCypherQAChain.from_llm(\n",
366
+ " graph=graph,\n",
367
+ " cypher_llm=ChatOpenAI(temperature=0, model=\"gpt-4\"),\n",
368
+ " qa_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-16k\"),\n",
369
+ " validate_cypher=True, # Validate relationship directions\n",
370
+ " verbose=True\n",
371
+ ")"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 31,
377
+ "metadata": {},
378
+ "outputs": [
379
+ {
380
+ "name": "stdout",
381
+ "output_type": "stream",
382
+ "text": [
383
+ "\n",
384
+ "\n",
385
+ "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
386
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
387
+ "Generated Cypher:\n",
388
+ "\u001b[32;1m\u001b[1;3mMATCH (c:Concept {name: \"hormonal therapies\"})-[:ARE_NOW_CALLED]->(newName:Concept) RETURN newName.name\u001b[0m\n",
389
+ "Full Context:\n",
390
+ "\u001b[32;1m\u001b[1;3m[]\u001b[0m\n",
391
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
392
+ "\n",
393
+ "\u001b[1m> Finished chain.\u001b[0m\n"
394
+ ]
395
+ },
396
+ {
397
+ "data": {
398
+ "text/plain": [
399
+ "{'query': 'What are hormonal therapies now called?',\n",
400
+ " 'result': \"I'm sorry, but I don't have the information to answer your question.\"}"
401
+ ]
402
+ },
403
+ "execution_count": 31,
404
+ "metadata": {},
405
+ "output_type": "execute_result"
406
+ }
407
+ ],
408
+ "source": [
409
+ "cypher_chain.invoke({\"query\": \"What are hormonal therapies now called?\"})"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": null,
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": []
418
+ }
419
+ ],
420
+ "metadata": {
421
+ "kernelspec": {
422
+ "display_name": "my_project_env",
423
+ "language": "python",
424
+ "name": "python3"
425
+ },
426
+ "language_info": {
427
+ "codemirror_mode": {
428
+ "name": "ipython",
429
+ "version": 3
430
+ },
431
+ "file_extension": ".py",
432
+ "mimetype": "text/x-python",
433
+ "name": "python",
434
+ "nbconvert_exporter": "python",
435
+ "pygments_lexer": "ipython3",
436
+ "version": "3.9.19"
437
+ }
438
+ },
439
+ "nbformat": 4,
440
+ "nbformat_minor": 2
441
+ }