cjber committed
Commit ded7051 · 0 Parent(s)

initial commit

.gitignore ADDED
@@ -0,0 +1,15 @@
+data/
+
+# python generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# venv
+.venv
+.old
+.envrc
+.env
.python-version ADDED
@@ -0,0 +1 @@
+3.12.2
README.md ADDED
@@ -0,0 +1,3 @@
+# semantic-catalogue
+
+A Dagster pipeline that builds a semantic dataset catalogue: dataset metadata is fetched from the ADR UK data catalogue API, dataset descriptions are embedded with OpenAI, and the embeddings are stored in a Pinecone index.
pyproject.toml ADDED
@@ -0,0 +1,44 @@
+[project]
+name = "semantic-catalogue"
+version = "0.1.0"
+description = "Add your description here"
+authors = [
+    { name = "cjber", email = "cjberragan@gmail.com" }
+]
+dependencies = [
+    "requests>=2.31.0",
+    "langchain>=0.2.0",
+    "langchain-community>=0.2.0",
+    "langchain-openai>=0.1.7",
+    "langchain-pinecone>=0.1.1",
+    "polars>=0.20.25",
+    # "duckdb>=0.10.2",
+    # "duckdb-engine>=0.12.0",
+    "python-dotenv>=1.0.1",
+    "dagster>=1.7.8",
+    "dagster-webserver>=1.7.8",
+    "dagster-openai>=0.23.9",
+]
+readme = "README.md"
+requires-python = ">= 3.8"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.rye]
+managed = true
+dev-dependencies = [
+    "ipython>=8.24.0",
+    "sourcery-cli>=1.18.0",
+    "ipdb>=0.13.13",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/semantic_catalogue"]
+
+[tool.dagster]
+module_name = "src"
requirements-dev.lock ADDED
@@ -0,0 +1,316 @@
+# generated by rye
+# use `rye lock` or `rye sync` to update this lockfile
+#
+# last locked with the following flags:
+#   pre: false
+#   features: []
+#   all-features: false
+#   with-sources: false
+#   generate-hashes: false
+
+-e file:.
+aiohttp==3.9.5
+    # via langchain
+    # via langchain-community
+aiosignal==1.3.1
+    # via aiohttp
+alembic==1.13.1
+    # via dagster
+aniso8601==9.0.1
+    # via graphene
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.4.0
+    # via gql
+    # via httpx
+    # via openai
+    # via starlette
+    # via watchfiles
+asttokens==2.4.1
+    # via stack-data
+attrs==23.2.0
+    # via aiohttp
+backoff==2.2.1
+    # via gql
+certifi==2024.6.2
+    # via httpcore
+    # via httpx
+    # via pinecone-client
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via dagster
+    # via dagster-webserver
+    # via uvicorn
+coloredlogs==14.0
+    # via dagster
+croniter==2.0.5
+    # via dagster
+dagster==1.7.9
+    # via dagster-graphql
+    # via dagster-openai
+    # via dagster-webserver
+    # via semantic-catalogue
+dagster-graphql==1.7.9
+    # via dagster-webserver
+dagster-openai==0.23.9
+    # via semantic-catalogue
+dagster-pipes==1.7.9
+    # via dagster
+dagster-webserver==1.7.9
+    # via semantic-catalogue
+dataclasses-json==0.6.6
+    # via langchain-community
+decorator==5.1.1
+    # via ipdb
+    # via ipython
+distro==1.9.0
+    # via openai
+docstring-parser==0.16
+    # via dagster
+executing==2.0.1
+    # via stack-data
+filelock==3.14.0
+    # via dagster
+frozenlist==1.4.1
+    # via aiohttp
+    # via aiosignal
+fsspec==2024.5.0
+    # via universal-pathlib
+gql==3.5.0
+    # via dagster-graphql
+graphene==3.3
+    # via dagster-graphql
+graphql-core==3.2.3
+    # via gql
+    # via graphene
+    # via graphql-relay
+graphql-relay==3.2.0
+    # via graphene
+greenlet==3.0.3
+    # via sqlalchemy
+grpcio==1.64.0
+    # via dagster
+    # via grpcio-health-checking
+grpcio-health-checking==1.62.2
+    # via dagster
+h11==0.14.0
+    # via httpcore
+    # via uvicorn
+httpcore==1.0.5
+    # via httpx
+httptools==0.6.1
+    # via uvicorn
+httpx==0.27.0
+    # via openai
+humanfriendly==10.0
+    # via coloredlogs
+idna==3.7
+    # via anyio
+    # via httpx
+    # via requests
+    # via yarl
+ipdb==0.13.13
+ipython==8.25.0
+    # via ipdb
+jedi==0.19.1
+    # via ipython
+jinja2==3.1.4
+    # via dagster
+jsonpatch==1.33
+    # via langchain-core
+jsonpointer==2.4
+    # via jsonpatch
+langchain==0.2.1
+    # via langchain-community
+    # via semantic-catalogue
+langchain-community==0.2.1
+    # via semantic-catalogue
+langchain-core==0.2.3
+    # via langchain
+    # via langchain-community
+    # via langchain-openai
+    # via langchain-pinecone
+    # via langchain-text-splitters
+langchain-openai==0.1.8
+    # via semantic-catalogue
+langchain-pinecone==0.1.1
+    # via semantic-catalogue
+langchain-text-splitters==0.2.0
+    # via langchain
+langsmith==0.1.67
+    # via langchain
+    # via langchain-community
+    # via langchain-core
+mako==1.3.5
+    # via alembic
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via jinja2
+    # via mako
+marshmallow==3.21.2
+    # via dataclasses-json
+matplotlib-inline==0.1.7
+    # via ipython
+mdurl==0.1.2
+    # via markdown-it-py
+multidict==6.0.5
+    # via aiohttp
+    # via yarl
+mypy-extensions==1.0.0
+    # via typing-inspect
+numpy==1.26.4
+    # via langchain
+    # via langchain-community
+    # via langchain-pinecone
+openai==1.30.5
+    # via dagster-openai
+    # via langchain-openai
+orjson==3.10.3
+    # via langsmith
+packaging==23.2
+    # via dagster
+    # via langchain-core
+    # via marshmallow
+parso==0.8.4
+    # via jedi
+pendulum==3.0.0
+    # via dagster
+pexpect==4.9.0
+    # via ipython
+pinecone-client==3.2.2
+    # via langchain-pinecone
+polars==0.20.31
+    # via semantic-catalogue
+prompt-toolkit==3.0.45
+    # via ipython
+protobuf==4.25.3
+    # via dagster
+    # via grpcio-health-checking
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.2
+    # via stack-data
+pydantic==2.7.2
+    # via dagster
+    # via langchain
+    # via langchain-core
+    # via langsmith
+    # via openai
+pydantic-core==2.18.3
+    # via pydantic
+pygments==2.18.0
+    # via ipython
+    # via rich
+python-dateutil==2.9.0.post0
+    # via croniter
+    # via dagster
+    # via pendulum
+    # via time-machine
+python-dotenv==1.0.1
+    # via dagster
+    # via semantic-catalogue
+    # via uvicorn
+pytz==2024.1
+    # via croniter
+    # via dagster
+pyyaml==6.0.1
+    # via dagster
+    # via langchain
+    # via langchain-community
+    # via langchain-core
+    # via uvicorn
+regex==2024.5.15
+    # via tiktoken
+requests==2.32.3
+    # via dagster
+    # via dagster-graphql
+    # via gql
+    # via langchain
+    # via langchain-community
+    # via langsmith
+    # via requests-toolbelt
+    # via semantic-catalogue
+    # via tiktoken
+requests-toolbelt==1.0.0
+    # via gql
+rich==13.7.1
+    # via dagster
+setuptools==70.0.0
+    # via dagster
+six==1.16.0
+    # via asttokens
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
+    # via openai
+sourcery-cli==1.18.0
+sqlalchemy==2.0.30
+    # via alembic
+    # via dagster
+    # via langchain
+    # via langchain-community
+stack-data==0.6.3
+    # via ipython
+starlette==0.37.2
+    # via dagster-graphql
+    # via dagster-webserver
+structlog==24.2.0
+    # via dagster
+tabulate==0.9.0
+    # via dagster
+tenacity==8.3.0
+    # via langchain
+    # via langchain-community
+    # via langchain-core
+tiktoken==0.7.0
+    # via langchain-openai
+time-machine==2.14.1
+    # via pendulum
+tomli==2.0.1
+    # via dagster
+toposort==1.10
+    # via dagster
+tqdm==4.66.4
+    # via dagster
+    # via openai
+    # via pinecone-client
+traitlets==5.14.3
+    # via ipython
+    # via matplotlib-inline
+typing-extensions==4.12.1
+    # via alembic
+    # via dagster
+    # via openai
+    # via pinecone-client
+    # via pydantic
+    # via pydantic-core
+    # via sqlalchemy
+    # via typing-inspect
+typing-inspect==0.9.0
+    # via dataclasses-json
+tzdata==2024.1
+    # via pendulum
+universal-pathlib==0.2.2
+    # via dagster
+urllib3==2.2.1
+    # via pinecone-client
+    # via requests
+uvicorn==0.30.1
+    # via dagster-webserver
+uvloop==0.19.0
+    # via uvicorn
+watchdog==4.0.1
+    # via dagster
+watchfiles==0.22.0
+    # via uvicorn
+wcwidth==0.2.13
+    # via prompt-toolkit
+websockets==12.0
+    # via uvicorn
+yarl==1.9.4
+    # via aiohttp
+    # via gql
requirements.lock ADDED
@@ -0,0 +1,282 @@
+# generated by rye
+# use `rye lock` or `rye sync` to update this lockfile
+#
+# last locked with the following flags:
+#   pre: false
+#   features: []
+#   all-features: false
+#   with-sources: false
+#   generate-hashes: false
+
+-e file:.
+aiohttp==3.9.5
+    # via langchain
+    # via langchain-community
+aiosignal==1.3.1
+    # via aiohttp
+alembic==1.13.1
+    # via dagster
+aniso8601==9.0.1
+    # via graphene
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.4.0
+    # via gql
+    # via httpx
+    # via openai
+    # via starlette
+    # via watchfiles
+attrs==23.2.0
+    # via aiohttp
+backoff==2.2.1
+    # via gql
+certifi==2024.6.2
+    # via httpcore
+    # via httpx
+    # via pinecone-client
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via dagster
+    # via dagster-webserver
+    # via uvicorn
+coloredlogs==14.0
+    # via dagster
+croniter==2.0.5
+    # via dagster
+dagster==1.7.9
+    # via dagster-graphql
+    # via dagster-openai
+    # via dagster-webserver
+    # via semantic-catalogue
+dagster-graphql==1.7.9
+    # via dagster-webserver
+dagster-openai==0.23.9
+    # via semantic-catalogue
+dagster-pipes==1.7.9
+    # via dagster
+dagster-webserver==1.7.9
+    # via semantic-catalogue
+dataclasses-json==0.6.6
+    # via langchain-community
+distro==1.9.0
+    # via openai
+docstring-parser==0.16
+    # via dagster
+filelock==3.14.0
+    # via dagster
+frozenlist==1.4.1
+    # via aiohttp
+    # via aiosignal
+fsspec==2024.5.0
+    # via universal-pathlib
+gql==3.5.0
+    # via dagster-graphql
+graphene==3.3
+    # via dagster-graphql
+graphql-core==3.2.3
+    # via gql
+    # via graphene
+    # via graphql-relay
+graphql-relay==3.2.0
+    # via graphene
+greenlet==3.0.3
+    # via sqlalchemy
+grpcio==1.64.0
+    # via dagster
+    # via grpcio-health-checking
+grpcio-health-checking==1.62.2
+    # via dagster
+h11==0.14.0
+    # via httpcore
+    # via uvicorn
+httpcore==1.0.5
+    # via httpx
+httptools==0.6.1
+    # via uvicorn
+httpx==0.27.0
+    # via openai
+humanfriendly==10.0
+    # via coloredlogs
+idna==3.7
+    # via anyio
+    # via httpx
+    # via requests
+    # via yarl
+jinja2==3.1.4
+    # via dagster
+jsonpatch==1.33
+    # via langchain-core
+jsonpointer==2.4
+    # via jsonpatch
+langchain==0.2.1
+    # via langchain-community
+    # via semantic-catalogue
+langchain-community==0.2.1
+    # via semantic-catalogue
+langchain-core==0.2.3
+    # via langchain
+    # via langchain-community
+    # via langchain-openai
+    # via langchain-pinecone
+    # via langchain-text-splitters
+langchain-openai==0.1.8
+    # via semantic-catalogue
+langchain-pinecone==0.1.1
+    # via semantic-catalogue
+langchain-text-splitters==0.2.0
+    # via langchain
+langsmith==0.1.67
+    # via langchain
+    # via langchain-community
+    # via langchain-core
+mako==1.3.5
+    # via alembic
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via jinja2
+    # via mako
+marshmallow==3.21.2
+    # via dataclasses-json
+mdurl==0.1.2
+    # via markdown-it-py
+multidict==6.0.5
+    # via aiohttp
+    # via yarl
+mypy-extensions==1.0.0
+    # via typing-inspect
+numpy==1.26.4
+    # via langchain
+    # via langchain-community
+    # via langchain-pinecone
+openai==1.30.5
+    # via dagster-openai
+    # via langchain-openai
+orjson==3.10.3
+    # via langsmith
+packaging==23.2
+    # via dagster
+    # via langchain-core
+    # via marshmallow
+pendulum==3.0.0
+    # via dagster
+pinecone-client==3.2.2
+    # via langchain-pinecone
+polars==0.20.31
+    # via semantic-catalogue
+protobuf==4.25.3
+    # via dagster
+    # via grpcio-health-checking
+pydantic==2.7.2
+    # via dagster
+    # via langchain
+    # via langchain-core
+    # via langsmith
+    # via openai
+pydantic-core==2.18.3
+    # via pydantic
+pygments==2.18.0
+    # via rich
+python-dateutil==2.9.0.post0
+    # via croniter
+    # via dagster
+    # via pendulum
+    # via time-machine
+python-dotenv==1.0.1
+    # via dagster
+    # via semantic-catalogue
+    # via uvicorn
+pytz==2024.1
+    # via croniter
+    # via dagster
+pyyaml==6.0.1
+    # via dagster
+    # via langchain
+    # via langchain-community
+    # via langchain-core
+    # via uvicorn
+regex==2024.5.15
+    # via tiktoken
+requests==2.32.3
+    # via dagster
+    # via dagster-graphql
+    # via gql
+    # via langchain
+    # via langchain-community
+    # via langsmith
+    # via requests-toolbelt
+    # via semantic-catalogue
+    # via tiktoken
+requests-toolbelt==1.0.0
+    # via gql
+rich==13.7.1
+    # via dagster
+setuptools==70.0.0
+    # via dagster
+six==1.16.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
+    # via openai
+sqlalchemy==2.0.30
+    # via alembic
+    # via dagster
+    # via langchain
+    # via langchain-community
+starlette==0.37.2
+    # via dagster-graphql
+    # via dagster-webserver
+structlog==24.2.0
+    # via dagster
+tabulate==0.9.0
+    # via dagster
+tenacity==8.3.0
+    # via langchain
+    # via langchain-community
+    # via langchain-core
+tiktoken==0.7.0
+    # via langchain-openai
+time-machine==2.14.1
+    # via pendulum
+tomli==2.0.1
+    # via dagster
+toposort==1.10
+    # via dagster
+tqdm==4.66.4
+    # via dagster
+    # via openai
+    # via pinecone-client
+typing-extensions==4.12.1
+    # via alembic
+    # via dagster
+    # via openai
+    # via pinecone-client
+    # via pydantic
+    # via pydantic-core
+    # via sqlalchemy
+    # via typing-inspect
+typing-inspect==0.9.0
+    # via dataclasses-json
+tzdata==2024.1
+    # via pendulum
+universal-pathlib==0.2.2
+    # via dagster
+urllib3==2.2.1
+    # via pinecone-client
+    # via requests
+uvicorn==0.30.1
+    # via dagster-webserver
+uvloop==0.19.0
+    # via uvicorn
+watchdog==4.0.1
+    # via dagster
+watchfiles==0.22.0
+    # via uvicorn
+websockets==12.0
+    # via uvicorn
+yarl==1.9.4
+    # via aiohttp
+    # via gql
src/__init__.py ADDED
@@ -0,0 +1,14 @@
+from dagster import Definitions, load_assets_from_modules
+
+from src.assets import adr, datastore
+from src.jobs import adr_job
+from src.resources import openai_resource
+
+adr_assets = load_assets_from_modules(modules=[adr], group_name="adr_assets")
+datastore_assets = load_assets_from_modules(modules=[datastore], group_name="datastore")
+
+defs = Definitions(
+    assets=[*adr_assets, *datastore_assets],
+    jobs=[adr_job],
+    resources={"openai": openai_resource},
+)
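
Because pyproject.toml sets `[tool.dagster] module_name = "src"`, `dagster dev` discovers this `defs` object. For a quick local check without the webserver, the ADR assets can also be materialized in-process. A minimal sketch (an assumption, not part of the commit; it presumes the `data/adr` directories exist and the ADR API is reachable):

# Hypothetical local run of the ADR ingestion assets (sketch, not part of the commit).
from dagster import materialize

from src.assets import adr

if __name__ == "__main__":
    result = materialize(
        [adr.adr_session, adr.adr_datasets_id, adr.adr_datasets, adr.adr_descriptions]
    )
    assert result.success
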
src/assets/adr.py ADDED
@@ -0,0 +1,118 @@
+import itertools
+import json
+import logging
+
+import polars as pl
+import requests
+from dagster import AssetExecutionContext, asset
+from tqdm import tqdm
+
+from src.common.utils import Paths
+
+PAGE_SIZE = 100
+API_VERSION = "2.0"
+BASE_URL = "https://api-datacatalogue.adruk.org/api"
+
+
+@asset
+def adr_session():
+    session = requests.Session()
+    session.headers.update({"X-API-Version": API_VERSION})
+    return session
+
+
+@asset
+def adr_datasets_id(
+    context: AssetExecutionContext, adr_session: requests.Session
+) -> pl.DataFrame:
+    datasets = []
+    for page_number in itertools.count(start=1):
+        context.log.info(f"Fetching page {page_number}")
+        datasets_page = _fetch_datasets_page(adr_session, page_number)
+        if not datasets_page:
+            break
+        datasets.extend(datasets_page)
+    df = pl.DataFrame(datasets)
+    df.write_parquet(Paths.ADR / "adr_datasets_id.parquet")
+    return df
+
+
+def _fetch_datasets_page(adr_session: requests.Session, page_number: int) -> dict:
+    params = {
+        "pageSize": PAGE_SIZE,
+        "pageNumber": page_number,
+        "include": "dataset::dataelement::dataclass",
+        "additionalProp1": "string",
+        "additionalProp2": "string",
+        "additionalProp3": "string",
+        "state": "START",
+    }
+    try:
+        response = adr_session.get(f"{BASE_URL}/{{sql}}/dataset", params=params)
+        response.raise_for_status()
+        return json.loads(response.content)["content"]
+    except requests.HTTPError as http_err:
+        logging.error(f"HTTP error occurred: {http_err}")
+    except Exception as err:
+        logging.error(f"Other error occurred: {err}")
+
+
+@asset
+def adr_datasets(
+    context: AssetExecutionContext,
+    adr_session: requests.Session,
+    adr_datasets_id: pl.DataFrame,
+) -> pl.DataFrame:
+    df = adr_datasets_id.filter(pl.col("searchResultType") == "PHYSICAL").with_columns(
+        pl.col("origin").struct[0].alias("origin_id")
+    )
+
+    datasets_list = []
+    for row in tqdm(df.rows(named=True), total=len(df)):
+        dataset = _fetch_dataset_info(context, adr_session, row)
+        datasets_list.append(dataset)
+    df = pl.DataFrame(datasets_list)
+    df.write_parquet(Paths.ADR / "adr_datasets.parquet")
+    return df
+
+
+def _fetch_dataset_info(
+    context: AssetExecutionContext, adr_session: requests.Session, row: dict
+) -> dict:
+    url = f"{BASE_URL}/dataset/{row['id']}?originId={row['origin_id']}"
+    try:
+        response = adr_session.get(url)
+        response.raise_for_status()
+    except requests.HTTPError as http_err:
+        context.log.error(f"HTTP error occurred: {http_err}")
+    except Exception as err:
+        context.log.error(f"Other error occurred: {err}")
+    content = json.loads(response.content)
+
+    return {
+        "id": row["id"],
+        "name": row["title"],
+        "origin_name": content["origin"]["name"],
+        "origin_id": row["origin_id"],
+        "doi": content["summary"].get("doiName"),
+        "url": content["origin"]["link"],
+        "keywords": content["summary"].get("keywords"),
+        "abstract": content["summary"]["abstract"],
+        "description": content.get("documentation", {}).get("description"),
+        "coverage": content["coverage"],
+    }
+
+
+@asset
+def adr_descriptions(adr_datasets: pl.DataFrame) -> None:
+    for item in adr_datasets.rows(named=True):
+        with open(
+            Paths.ADR
+            / f"descriptions/{item['id']}-{item['origin_id']}-description.txt",
+            "w",
+        ) as f:
+            f.write(
+                f"Dataset Title: {item['name']}"
+                f"\n\nDescription: \n\n{item['description']}"
+                f"\n\nAbstract: \n\n{item['abstract']}"
+            )
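
`adr_datasets_id` and `adr_datasets` write parquet files under `data/adr/`, and `adr_descriptions` writes one text file per dataset into `data/adr/descriptions/`; none of the assets create these directories, and `data/` is gitignored. A one-off setup step along these lines is assumed before the first run:

# Assumed one-off setup (not part of the commit): create the gitignored output
# directories that the adr assets write into.
from src.common.utils import Paths

(Paths.ADR / "descriptions").mkdir(parents=True, exist_ok=True)
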
src/assets/datastore.py ADDED
@@ -0,0 +1,48 @@
+from dagster import AssetExecutionContext, asset
+from dagster_openai import OpenAIResource
+from dotenv import load_dotenv
+from langchain_community.document_loaders import DirectoryLoader, TextLoader
+from langchain_openai import OpenAIEmbeddings
+from langchain_pinecone import PineconeVectorStore
+from langchain_text_splitters import CharacterTextSplitter
+from pinecone import Pinecone, ServerlessSpec
+
+from src.common.utils import Consts, Paths
+
+load_dotenv()
+
+
+@asset(compute_kind="Pinecone")
+def pinecone_index(context: AssetExecutionContext):
+    pc = Pinecone()
+    if Consts.INDEX_NAME in [index["name"] for index in pc.list_indexes()]:
+        pc.delete_index(Consts.INDEX_NAME)
+
+    pc.create_index(
+        name=Consts.INDEX_NAME,
+        dimension=Consts.EMBEDDING_DIM,
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+        metric="cosine",
+    )
+
+
+@asset(compute_kind="OpenAI", deps=["adr_descriptions", "pinecone_index"])
+def adr_pinecone(context: AssetExecutionContext, openai: OpenAIResource):
+    loader = DirectoryLoader(
+        Paths.ADR / "descriptions",
+        glob="*.txt",
+        loader_cls=TextLoader,
+        use_multithreading=True,
+        show_progress=True,
+    )
+    documents = loader.load()
+    text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
+    docs = text_splitter.split_documents(documents)
+
+    with openai.get_client(context) as client:
+        embeddings = OpenAIEmbeddings(
+            client=client.embeddings,
+            model=Consts.EMBEDDING_MODEL,
+        )
+
+    PineconeVectorStore.from_documents(docs, embeddings, index_name=Consts.INDEX_NAME)
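
Once `adr_pinecone` has populated the index, it can be queried back through the same langchain-pinecone wrapper. A minimal retrieval sketch (not part of the commit; it assumes `OPENAI_API_KEY` and `PINECONE_API_KEY` are available, e.g. from the gitignored `.env` file loaded above, and the query string is purely illustrative):

# Retrieval sketch (assumption, not part of the commit): query the index built above.
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

from src.common.utils import Consts

load_dotenv()  # provides OPENAI_API_KEY and PINECONE_API_KEY

store = PineconeVectorStore.from_existing_index(
    index_name=Consts.INDEX_NAME,
    embedding=OpenAIEmbeddings(model=Consts.EMBEDDING_MODEL),
)
# Print the dataset descriptions closest to an illustrative query in embedding space.
for doc in store.similarity_search("household income and deprivation", k=4):
    print(doc.metadata.get("source"), doc.page_content[:80])
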
src/common/utils.py ADDED
@@ -0,0 +1,12 @@
+from pathlib import Path
+
+
+class Paths:
+    DATA = Path("data")
+    ADR = DATA / "adr"
+
+
+class Consts:
+    INDEX_NAME = "data-catalogue"
+    EMBEDDING_MODEL = "text-embedding-3-large"
+    EMBEDDING_DIM = 3072
src/jobs.py ADDED
@@ -0,0 +1,7 @@
+from dagster import define_asset_job
+
+adr_job = define_asset_job(
+    "adr",
+    selection=["adr_session", "adr_datasets_id", "adr_datasets", "adr_descriptions"],
+)
+pinecone_job = define_asset_job("pinecone", selection=["adr_pinecone"])
src/resources.py ADDED
@@ -0,0 +1,4 @@
+from dagster import EnvVar
+from dagster_openai import OpenAIResource
+
+openai_resource = OpenAIResource(api_key=EnvVar("OPENAI_API_KEY"))
src/sensors.py ADDED
@@ -0,0 +1,2 @@
+# see https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors
+# cursor to check which files are new
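
One possible shape for the sensor hinted at in these comments (a hypothetical sketch, not part of the commit): store the newest modification time of the description files in the sensor cursor and request a `pinecone_job` run whenever newer files appear.

# Hypothetical cursor-based sensor (sketch, not part of the commit).
from dagster import RunRequest, SensorEvaluationContext, sensor

from src.common.utils import Paths
from src.jobs import pinecone_job


@sensor(job=pinecone_job)
def new_descriptions_sensor(context: SensorEvaluationContext):
    # The cursor holds the newest file mtime seen in previous evaluations.
    last_mtime = float(context.cursor) if context.cursor else 0.0
    new_files = [
        path
        for path in (Paths.ADR / "descriptions").glob("*.txt")
        if path.stat().st_mtime > last_mtime
    ]
    if not new_files:
        return
    latest = max(path.stat().st_mtime for path in new_files)
    context.update_cursor(str(latest))
    yield RunRequest(run_key=f"descriptions-{latest}")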