cjber committed on
Commit a9ae09a · 1 Parent(s): ded7051

refactor to use langgraph + dagster

Containerfile ADDED
@@ -0,0 +1,12 @@
+ FROM python:3.12
+
+ ENV VIRTUAL_ENV=/usr/local
+
+ ENV DAGSTER_HOME=/opt/dagster/dagster_home/
+ RUN mkdir -p $DAGSTER_HOME /opt/dagster/app
+ COPY dagster.yaml $DAGSTER_HOME
+
+ WORKDIR /opt/dagster/app
+
+ COPY requirements.lock pyproject.toml .env README.md ./
+ RUN pip install --no-cache-dir -r requirements.lock
compose.yml ADDED
@@ -0,0 +1,99 @@
+ services:
+   datastore-postgresql:
+     image: postgres:11
+     container_name: datastore-postgresql
+     environment:
+       POSTGRES_USER: "postgres_user"
+       POSTGRES_PASSWORD: "postgres_password"
+       POSTGRES_DB: "postgres_db"
+     networks:
+       - datastore-network
+
+   datastore-code:
+     container_name: datastore-code
+     restart: always
+     build:
+       context: .
+       dockerfile: Containerfile
+     image: datastore
+     security_opt:
+       - "label:disable"
+     entrypoint:
+       - dagster
+       - api
+       - grpc
+       - -h
+       - "0.0.0.0"
+       - -p
+       - "4000"
+       - -m
+       - src.datastore
+     environment:
+       DAGSTER_POSTGRES_USER: "postgres_user"
+       DAGSTER_POSTGRES_PASSWORD: "postgres_password"
+       DAGSTER_POSTGRES_DB: "postgres_db"
+       DAGSTER_CURRENT_IMAGE: "datastore"
+     volumes:
+       - ./src:/opt/dagster/app/src
+       - ./config:/opt/dagster/app/config
+       - ./data:/opt/dagster/app/data
+     networks:
+       - datastore-network
+
+   datastore-server:
+     container_name: datastore-server
+     build:
+       context: .
+       dockerfile: Containerfile
+     entrypoint:
+       - dagster-webserver
+       - -h
+       - "0.0.0.0"
+       - -p
+       - "3000"
+     expose:
+       - "3000"
+     ports:
+       - 3000:3000
+     security_opt:
+       - "label:disable"
+     environment:
+       DAGSTER_POSTGRES_USER: "postgres_user"
+       DAGSTER_POSTGRES_PASSWORD: "postgres_password"
+       DAGSTER_POSTGRES_DB: "postgres_db"
+     volumes:
+       - ./src:/opt/dagster/app/src
+       - ./config:/opt/dagster/app/config
+       - ./data:/opt/dagster/app/data
+     networks:
+       - datastore-network
+
+   datastore-daemon:
+     container_name: datastore-daemon
+     restart: on-failure
+     build:
+       context: .
+       dockerfile: Containerfile
+     entrypoint:
+       - dagster-daemon
+       - run
+     security_opt:
+       - "label:disable"
+     environment:
+       DAGSTER_POSTGRES_USER: "postgres_user"
+       DAGSTER_POSTGRES_PASSWORD: "postgres_password"
+       DAGSTER_POSTGRES_DB: "postgres_db"
+     volumes:
+       - ./src:/opt/dagster/app/src
+       - ./config:/opt/dagster/app/config
+       - ./data:/opt/dagster/app/data
+     networks:
+       - datastore-network
+     depends_on:
+       - datastore-postgresql
+       - datastore-code
+
+ networks:
+   datastore-network:
+     driver: bridge
+     name: datastore-network
config/config.toml ADDED
@@ -0,0 +1,10 @@
+ [datastore]
+ index_name = "data-catalogue"
+ embed_model = "text-embedding-3-large"
+ embed_dim = 3072
+ chunk_size = 256
+ chunk_overlap = 32
+
+ [model]
+ llm = "gpt-3.5-turbo"
+ top_k = 30
dagster.yaml ADDED
@@ -0,0 +1,52 @@
+ scheduler:
+   module: dagster.core.scheduler
+   class: DagsterDaemonScheduler
+
+ run_retries:
+   enabled: true
+   max_retries: 3
+
+ run_storage:
+   module: dagster_postgres.run_storage
+   class: PostgresRunStorage
+   config:
+     postgres_db:
+       hostname: datastore-postgresql
+       username:
+         env: DAGSTER_POSTGRES_USER
+       password:
+         env: DAGSTER_POSTGRES_PASSWORD
+       db_name:
+         env: DAGSTER_POSTGRES_DB
+       port: 5432
+
+ schedule_storage:
+   module: dagster_postgres.schedule_storage
+   class: PostgresScheduleStorage
+   config:
+     postgres_db:
+       hostname: datastore-postgresql
+       username:
+         env: DAGSTER_POSTGRES_USER
+       password:
+         env: DAGSTER_POSTGRES_PASSWORD
+       db_name:
+         env: DAGSTER_POSTGRES_DB
+       port: 5432
+
+ event_log_storage:
+   module: dagster_postgres.event_log
+   class: PostgresEventLogStorage
+   config:
+     postgres_db:
+       hostname: datastore-postgresql
+       username:
+         env: DAGSTER_POSTGRES_USER
+       password:
+         env: DAGSTER_POSTGRES_PASSWORD
+       db_name:
+         env: DAGSTER_POSTGRES_DB
+       port: 5432
+
+ telemetry:
+   enabled: false
pyproject.toml CHANGED
@@ -6,18 +6,26 @@ authors = [
     { name = "cjber", email = "cjberragan@gmail.com" }
 ]
 dependencies = [
-    "requests>=2.31.0",
-    "langchain>=0.2.0",
-    "langchain-community>=0.2.0",
-    "langchain-openai>=0.1.7",
-    "langchain-pinecone>=0.1.1",
-    "polars>=0.20.25",
-    # "duckdb>=0.10.2",
-    # "duckdb-engine>=0.12.0",
-    "python-dotenv>=1.0.1",
     "dagster>=1.7.8",
     "dagster-webserver>=1.7.8",
+    "dagster-postgres>=0.23.15",
     "dagster-openai>=0.23.9",
+    "requests>=2.31.0",
+    "polars>=0.20.25",
+    "python-dotenv>=1.0.1",
+    "sickle>=0.7.0",
+    "lxml>=5.2.2",
+    "pydantic-settings>=2.3.4",
+    "pdfminer-six>=20240706",
+    "dateparser>=1.2.0",
+    "langchain>=0.2.12",
+    "langchain-community>=0.2.10",
+    "langchain-pinecone>=0.1.3",
+    "langchain-openai>=0.1.20",
+    "langchain-experimental>=0.0.63",
+    "langgraph>=0.1.19",
+    "langchainhub>=0.1.20",
+    "fastapi[standard]>=0.112.0",
 ]
 readme = "README.md"
 requires-python = ">= 3.8"
@@ -41,4 +49,4 @@ allow-direct-references = true
 packages = ["src/semantic_catalogue"]
 
 [tool.dagster]
-module_name = "src"
+module_name = "src.datastore"
reports/DOCS.md ADDED
@@ -0,0 +1,72 @@
+ # Introduction
+
+ * Unify search across catalogues
+ * Uses semantic search with RAG for results explainability
+ * Llama Index framework
+
+ # Methodology
+
+ ## Pre-processing
+
+ For each catalogue, its respective API was used to retrieve dataset metadata. Each returned result contains descriptive information about the datasets, which forms the bulk of the text used by the semantic search system to return results. For the CDRC catalogue, PDFs were also processed to extract text. Other metadata that may be used by the final system, for example the data creation date, was also retained.
+
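+ As an illustrative sketch rather than the pipeline's actual code, the PDF text extraction can be handled with `pdfminer.six` (a project dependency); the directory names here are assumptions:
+
+ ```python
+ from pathlib import Path
+
+ from pdfminer.high_level import extract_text
+
+ # Assumed locations for downloaded catalogue PDFs and their extracted text.
+ pdf_dir = Path("data/cdrc/pdf")
+ txt_dir = Path("data/cdrc/txt")
+ txt_dir.mkdir(parents=True, exist_ok=True)
+
+ for pdf in pdf_dir.glob("*.pdf"):
+     # Pull the raw text layer out of each PDF and save it for embedding.
+     text = extract_text(pdf)
+     (txt_dir / f"{pdf.stem}.txt").write_text(text)
+ ```
+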
+ ## Datastore
+
+ The description of each dataset was then saved into an individual text file, identifiable by a unique ID. These files were embedded using OpenAI embeddings and uploaded to the Pinecone database, alongside any metadata. Descriptions were 'chunked' into individual segments 1024 tokens in length, and for each chunk the dataset title was embedded at the start.
+
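+ A minimal sketch of this step using the LangChain components from the project dependencies (the data path is an assumption; the index name and embedding model follow `config/config.toml`):
+
+ ```python
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_text_splitters import CharacterTextSplitter
+
+ # Load one description file per dataset (directory name is an assumption).
+ loader = DirectoryLoader("data/descriptions", glob="*.txt", loader_cls=TextLoader)
+ documents = loader.load()
+
+ # Split each description into chunks (the report describes 1024-token segments).
+ splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
+ docs = splitter.split_documents(documents)
+
+ # Embed every chunk and upsert it into the Pinecone index with its metadata.
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
+ PineconeVectorStore.from_documents(docs, embeddings, index_name="data-catalogue")
+ ```
+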
+ ## RAG Model
+
+ A RAG system was then built which embeds a user query using the same embedding model and returns the top 'k' results, ranked by cosine similarity, from the Pinecone database. To ensure that results are ranked by dataset, a custom document grouping postprocessor was defined, which groups all document chunks relating to the same dataset; the highest score from any chunk is used to rank the grouped documents.
+
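+ A sketch of the grouping logic (the function and field names are illustrative, not the actual postprocessor):
+
+ ```python
+ from collections import defaultdict
+
+
+ def group_chunks_by_dataset(chunks: list[dict]) -> list[dict]:
+     """Group retrieved chunks by dataset ID and rank groups by their best chunk."""
+     groups: dict[str, list[dict]] = defaultdict(list)
+     for chunk in chunks:
+         groups[chunk["dataset_id"]].append(chunk)
+
+     grouped = [
+         {"dataset_id": ds, "score": max(c["score"] for c in items), "chunks": items}
+         for ds, items in groups.items()
+     ]
+     # The highest-scoring chunk determines the rank of the whole dataset.
+     return sorted(grouped, key=lambda g: g["score"], reverse=True)
+ ```
+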
+ An adjustable 'alpha' value was used to allow a mixture of traditional 'sparse vector' search (e.g. BM25: keyword search) and 'dense vector' search using the LLM embeddings.
+
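+ The weighting amounts to a simple convex combination of the two scores; a sketch, assuming both scores are normalised to the same range:
+
+ ```python
+ def hybrid_score(dense: float, sparse: float, alpha: float = 0.5) -> float:
+     """Blend dense (embedding) and sparse (keyword/BM25) relevance scores.
+
+     alpha = 1.0 gives pure dense search, alpha = 0.0 gives pure keyword search.
+     """
+     return alpha * dense + (1 - alpha) * sparse
+ ```
+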
+ For each unique document returned, an explainable 'Ask AI' option was added, which feeds the grouped document into a GPT LLM with the following prompt:
+
+ ```python
+ prompt = """
+ Below is a dataset description that is relevant to a researcher's query.
+
+ Explain the relevance of this dataset to the query in under 50 words. Use your own knowledge or the data profile. Do not say it is unrelated; attempt to find a relevant connection.
+
+ ---------------------
+ Query: "{query_str}"
+
+ Dataset description:
+
+ {context_str}
+ ---------------------
+ """
+ ```
+
+ This approach ensures that users receive not only relevant search results but also an understandable explanation of each dataset's relevance to their query.
+
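+ A sketch of how the prompt might be filled and sent to the model (the model name follows `config/config.toml`; the query and description strings are placeholders):
+
+ ```python
+ from langchain_openai import ChatOpenAI
+
+ llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+
+ # query_str comes from the user; context_str is the grouped dataset description.
+ message = prompt.format(
+     query_str="school-level attainment data",
+     context_str="Dataset Title: ...\n\nDescription: ...",
+ )
+ explanation = llm.invoke(message).content
+ print(explanation)
+ ```
+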
+ # System architecture
+
+ ## Overview
+
+ ![](./figs/system.png)
+
+ ## Data flow
+
+ (Describe the flow of data from the catalogues to the end-user.)
+
+ ## Implementation details
+
+ * Tools and Libraries: OpenAI API, Pinecone, Llama Index
+ * Challenges: (Detail any challenges and solutions.)
+
+ # Evaluation and results
+
+ * Performance Metrics: Search accuracy, response time, user feedback
+ * Comparison: Effectiveness of keyword search vs. dense vector search
+
+ # Future work and improvements
+
+ * Potential improvements and future enhancements
+ * Discuss limitations of the current implementation
+
+ # Conclusion
+
+ Summarise the key points and the impact of the unified search system.
+
+ # References
+
+ (List any academic papers, tools, or libraries referenced.)
reports/figs/system.png ADDED
requirements-dev.lock CHANGED
@@ -7,14 +7,16 @@
7
  # all-features: false
8
  # with-sources: false
9
  # generate-hashes: false
 
10
 
11
  -e file:.
12
  aiohttp==3.9.5
13
  # via langchain
14
  # via langchain-community
 
15
  aiosignal==1.3.1
16
  # via aiohttp
17
- alembic==1.13.1
18
  # via dagster
19
  aniso8601==9.0.1
20
  # via graphene
@@ -32,51 +34,70 @@ attrs==23.2.0
32
  # via aiohttp
33
  backoff==2.2.1
34
  # via gql
35
- certifi==2024.6.2
36
  # via httpcore
37
  # via httpx
38
  # via pinecone-client
39
  # via requests
 
 
40
  charset-normalizer==3.3.2
 
41
  # via requests
42
  click==8.1.7
43
  # via dagster
44
  # via dagster-webserver
 
45
  # via uvicorn
46
  coloredlogs==14.0
47
  # via dagster
48
- croniter==2.0.5
49
  # via dagster
50
- dagster==1.7.9
 
 
51
  # via dagster-graphql
52
  # via dagster-openai
 
53
  # via dagster-webserver
54
  # via semantic-catalogue
55
- dagster-graphql==1.7.9
56
  # via dagster-webserver
57
- dagster-openai==0.23.9
58
  # via semantic-catalogue
59
- dagster-pipes==1.7.9
60
  # via dagster
61
- dagster-webserver==1.7.9
 
 
62
  # via semantic-catalogue
63
- dataclasses-json==0.6.6
64
  # via langchain-community
 
 
65
  decorator==5.1.1
66
  # via ipdb
67
  # via ipython
68
  distro==1.9.0
69
  # via openai
 
 
70
  docstring-parser==0.16
71
  # via dagster
 
 
72
  executing==2.0.1
73
  # via stack-data
74
- filelock==3.14.0
 
 
 
 
75
  # via dagster
76
  frozenlist==1.4.1
77
  # via aiohttp
78
  # via aiosignal
79
- fsspec==2024.5.0
80
  # via universal-pathlib
81
  gql==3.5.0
82
  # via dagster-graphql
@@ -90,7 +111,7 @@ graphql-relay==3.2.0
90
  # via graphene
91
  greenlet==3.0.3
92
  # via sqlalchemy
93
- grpcio==1.64.0
94
  # via dagster
95
  # via grpcio-health-checking
96
  grpcio-health-checking==1.62.2
@@ -103,46 +124,61 @@ httpcore==1.0.5
103
  httptools==0.6.1
104
  # via uvicorn
105
  httpx==0.27.0
 
106
  # via openai
107
  humanfriendly==10.0
108
  # via coloredlogs
109
  idna==3.7
110
  # via anyio
 
111
  # via httpx
112
  # via requests
113
  # via yarl
114
  ipdb==0.13.13
115
- ipython==8.25.0
116
  # via ipdb
117
  jedi==0.19.1
118
  # via ipython
119
  jinja2==3.1.4
120
  # via dagster
 
121
  jsonpatch==1.33
122
  # via langchain-core
123
- jsonpointer==2.4
124
  # via jsonpatch
125
- langchain==0.2.1
126
  # via langchain-community
127
  # via semantic-catalogue
128
- langchain-community==0.2.1
 
129
  # via semantic-catalogue
130
- langchain-core==0.2.3
131
  # via langchain
132
  # via langchain-community
 
133
  # via langchain-openai
134
  # via langchain-pinecone
135
  # via langchain-text-splitters
136
- langchain-openai==0.1.8
 
137
  # via semantic-catalogue
138
- langchain-pinecone==0.1.1
139
  # via semantic-catalogue
140
- langchain-text-splitters==0.2.0
 
 
141
  # via langchain
142
- langsmith==0.1.67
 
 
 
 
143
  # via langchain
144
  # via langchain-community
145
  # via langchain-core
 
 
 
146
  mako==1.3.5
147
  # via alembic
148
  markdown-it-py==3.0.0
@@ -150,7 +186,7 @@ markdown-it-py==3.0.0
150
  markupsafe==2.1.5
151
  # via jinja2
152
  # via mako
153
- marshmallow==3.21.2
154
  # via dataclasses-json
155
  matplotlib-inline==0.1.7
156
  # via ipython
@@ -165,64 +201,85 @@ numpy==1.26.4
165
  # via langchain
166
  # via langchain-community
167
  # via langchain-pinecone
168
- openai==1.30.5
169
  # via dagster-openai
170
  # via langchain-openai
171
- orjson==3.10.3
172
  # via langsmith
173
- packaging==23.2
174
  # via dagster
175
  # via langchain-core
 
176
  # via marshmallow
177
  parso==0.8.4
178
  # via jedi
 
 
179
  pendulum==3.0.0
180
  # via dagster
181
  pexpect==4.9.0
182
  # via ipython
183
- pinecone-client==3.2.2
184
  # via langchain-pinecone
185
- polars==0.20.31
 
 
 
 
 
186
  # via semantic-catalogue
187
- prompt-toolkit==3.0.45
188
  # via ipython
189
- protobuf==4.25.3
190
  # via dagster
191
  # via grpcio-health-checking
 
 
192
  ptyprocess==0.7.0
193
  # via pexpect
194
- pure-eval==0.2.2
195
  # via stack-data
196
- pydantic==2.7.2
 
 
197
  # via dagster
 
198
  # via langchain
199
  # via langchain-core
200
  # via langsmith
201
  # via openai
202
- pydantic-core==2.18.3
 
203
  # via pydantic
 
 
204
  pygments==2.18.0
205
  # via ipython
206
  # via rich
207
  python-dateutil==2.9.0.post0
208
  # via croniter
209
- # via dagster
210
  # via pendulum
211
  # via time-machine
212
  python-dotenv==1.0.1
213
  # via dagster
 
214
  # via semantic-catalogue
215
  # via uvicorn
 
 
216
  pytz==2024.1
217
  # via croniter
218
  # via dagster
 
219
  pyyaml==6.0.1
220
  # via dagster
221
  # via langchain
222
  # via langchain-community
223
  # via langchain-core
224
  # via uvicorn
225
- regex==2024.5.15
 
226
  # via tiktoken
227
  requests==2.32.3
228
  # via dagster
@@ -230,16 +287,23 @@ requests==2.32.3
230
  # via gql
231
  # via langchain
232
  # via langchain-community
 
233
  # via langsmith
234
  # via requests-toolbelt
235
  # via semantic-catalogue
 
236
  # via tiktoken
237
  requests-toolbelt==1.0.0
238
  # via gql
239
  rich==13.7.1
240
  # via dagster
241
- setuptools==70.0.0
 
242
  # via dagster
 
 
 
 
243
  six==1.16.0
244
  # via asttokens
245
  # via python-dateutil
@@ -248,7 +312,7 @@ sniffio==1.3.1
248
  # via httpx
249
  # via openai
250
  sourcery-cli==1.18.0
251
- sqlalchemy==2.0.30
252
  # via alembic
253
  # via dagster
254
  # via langchain
@@ -258,17 +322,18 @@ stack-data==0.6.3
258
  starlette==0.37.2
259
  # via dagster-graphql
260
  # via dagster-webserver
261
- structlog==24.2.0
 
262
  # via dagster
263
  tabulate==0.9.0
264
  # via dagster
265
- tenacity==8.3.0
266
  # via langchain
267
  # via langchain-community
268
  # via langchain-core
269
  tiktoken==0.7.0
270
  # via langchain-openai
271
- time-machine==2.14.1
272
  # via pendulum
273
  tomli==2.0.1
274
  # via dagster
@@ -281,26 +346,38 @@ tqdm==4.66.4
281
  traitlets==5.14.3
282
  # via ipython
283
  # via matplotlib-inline
284
- typing-extensions==4.12.1
 
 
 
 
285
  # via alembic
286
  # via dagster
 
 
287
  # via openai
288
  # via pinecone-client
289
  # via pydantic
290
  # via pydantic-core
291
  # via sqlalchemy
 
292
  # via typing-inspect
293
  typing-inspect==0.9.0
294
  # via dataclasses-json
295
  tzdata==2024.1
296
  # via pendulum
 
 
297
  universal-pathlib==0.2.2
298
  # via dagster
299
- urllib3==2.2.1
300
  # via pinecone-client
301
  # via requests
302
- uvicorn==0.30.1
 
303
  # via dagster-webserver
 
 
304
  uvloop==0.19.0
305
  # via uvicorn
306
  watchdog==4.0.1
 
7
  # all-features: false
8
  # with-sources: false
9
  # generate-hashes: false
10
+ # universal: false
11
 
12
  -e file:.
13
  aiohttp==3.9.5
14
  # via langchain
15
  # via langchain-community
16
+ # via langchain-pinecone
17
  aiosignal==1.3.1
18
  # via aiohttp
19
+ alembic==1.13.2
20
  # via dagster
21
  aniso8601==9.0.1
22
  # via graphene
 
34
  # via aiohttp
35
  backoff==2.2.1
36
  # via gql
37
+ certifi==2024.7.4
38
  # via httpcore
39
  # via httpx
40
  # via pinecone-client
41
  # via requests
42
+ cffi==1.16.0
43
+ # via cryptography
44
  charset-normalizer==3.3.2
45
+ # via pdfminer-six
46
  # via requests
47
  click==8.1.7
48
  # via dagster
49
  # via dagster-webserver
50
+ # via typer
51
  # via uvicorn
52
  coloredlogs==14.0
53
  # via dagster
54
+ croniter==3.0.3
55
  # via dagster
56
+ cryptography==43.0.0
57
+ # via pdfminer-six
58
+ dagster==1.7.15
59
  # via dagster-graphql
60
  # via dagster-openai
61
+ # via dagster-postgres
62
  # via dagster-webserver
63
  # via semantic-catalogue
64
+ dagster-graphql==1.7.15
65
  # via dagster-webserver
66
+ dagster-openai==0.23.15
67
  # via semantic-catalogue
68
+ dagster-pipes==1.7.15
69
  # via dagster
70
+ dagster-postgres==0.23.15
71
+ # via semantic-catalogue
72
+ dagster-webserver==1.7.15
73
  # via semantic-catalogue
74
+ dataclasses-json==0.6.7
75
  # via langchain-community
76
+ dateparser==1.2.0
77
+ # via semantic-catalogue
78
  decorator==5.1.1
79
  # via ipdb
80
  # via ipython
81
  distro==1.9.0
82
  # via openai
83
+ dnspython==2.6.1
84
+ # via email-validator
85
  docstring-parser==0.16
86
  # via dagster
87
+ email-validator==2.2.0
88
+ # via fastapi
89
  executing==2.0.1
90
  # via stack-data
91
+ fastapi==0.112.0
92
+ # via semantic-catalogue
93
+ fastapi-cli==0.0.5
94
+ # via fastapi
95
+ filelock==3.15.4
96
  # via dagster
97
  frozenlist==1.4.1
98
  # via aiohttp
99
  # via aiosignal
100
+ fsspec==2024.6.1
101
  # via universal-pathlib
102
  gql==3.5.0
103
  # via dagster-graphql
 
111
  # via graphene
112
  greenlet==3.0.3
113
  # via sqlalchemy
114
+ grpcio==1.64.1
115
  # via dagster
116
  # via grpcio-health-checking
117
  grpcio-health-checking==1.62.2
 
124
  httptools==0.6.1
125
  # via uvicorn
126
  httpx==0.27.0
127
+ # via fastapi
128
  # via openai
129
  humanfriendly==10.0
130
  # via coloredlogs
131
  idna==3.7
132
  # via anyio
133
+ # via email-validator
134
  # via httpx
135
  # via requests
136
  # via yarl
137
  ipdb==0.13.13
138
+ ipython==8.26.0
139
  # via ipdb
140
  jedi==0.19.1
141
  # via ipython
142
  jinja2==3.1.4
143
  # via dagster
144
+ # via fastapi
145
  jsonpatch==1.33
146
  # via langchain-core
147
+ jsonpointer==3.0.0
148
  # via jsonpatch
149
+ langchain==0.2.12
150
  # via langchain-community
151
  # via semantic-catalogue
152
+ langchain-community==0.2.10
153
+ # via langchain-experimental
154
  # via semantic-catalogue
155
+ langchain-core==0.2.27
156
  # via langchain
157
  # via langchain-community
158
+ # via langchain-experimental
159
  # via langchain-openai
160
  # via langchain-pinecone
161
  # via langchain-text-splitters
162
+ # via langgraph
163
+ langchain-experimental==0.0.63
164
  # via semantic-catalogue
165
+ langchain-openai==0.1.20
166
  # via semantic-catalogue
167
+ langchain-pinecone==0.1.3
168
+ # via semantic-catalogue
169
+ langchain-text-splitters==0.2.2
170
  # via langchain
171
+ langchainhub==0.1.20
172
+ # via semantic-catalogue
173
+ langgraph==0.1.19
174
+ # via semantic-catalogue
175
+ langsmith==0.1.96
176
  # via langchain
177
  # via langchain-community
178
  # via langchain-core
179
+ lxml==5.2.2
180
+ # via semantic-catalogue
181
+ # via sickle
182
  mako==1.3.5
183
  # via alembic
184
  markdown-it-py==3.0.0
 
186
  markupsafe==2.1.5
187
  # via jinja2
188
  # via mako
189
+ marshmallow==3.21.3
190
  # via dataclasses-json
191
  matplotlib-inline==0.1.7
192
  # via ipython
 
201
  # via langchain
202
  # via langchain-community
203
  # via langchain-pinecone
204
+ openai==1.37.1
205
  # via dagster-openai
206
  # via langchain-openai
207
+ orjson==3.10.6
208
  # via langsmith
209
+ packaging==24.1
210
  # via dagster
211
  # via langchain-core
212
+ # via langchainhub
213
  # via marshmallow
214
  parso==0.8.4
215
  # via jedi
216
+ pdfminer-six==20240706
217
+ # via semantic-catalogue
218
  pendulum==3.0.0
219
  # via dagster
220
  pexpect==4.9.0
221
  # via ipython
222
+ pinecone-client==5.0.1
223
  # via langchain-pinecone
224
+ pinecone-plugin-inference==1.0.3
225
+ # via pinecone-client
226
+ pinecone-plugin-interface==0.0.7
227
+ # via pinecone-client
228
+ # via pinecone-plugin-inference
229
+ polars==1.3.0
230
  # via semantic-catalogue
231
+ prompt-toolkit==3.0.47
232
  # via ipython
233
+ protobuf==4.25.4
234
  # via dagster
235
  # via grpcio-health-checking
236
+ psycopg2-binary==2.9.9
237
+ # via dagster-postgres
238
  ptyprocess==0.7.0
239
  # via pexpect
240
+ pure-eval==0.2.3
241
  # via stack-data
242
+ pycparser==2.22
243
+ # via cffi
244
+ pydantic==2.8.2
245
  # via dagster
246
+ # via fastapi
247
  # via langchain
248
  # via langchain-core
249
  # via langsmith
250
  # via openai
251
+ # via pydantic-settings
252
+ pydantic-core==2.20.1
253
  # via pydantic
254
+ pydantic-settings==2.3.4
255
+ # via semantic-catalogue
256
  pygments==2.18.0
257
  # via ipython
258
  # via rich
259
  python-dateutil==2.9.0.post0
260
  # via croniter
261
+ # via dateparser
262
  # via pendulum
263
  # via time-machine
264
  python-dotenv==1.0.1
265
  # via dagster
266
+ # via pydantic-settings
267
  # via semantic-catalogue
268
  # via uvicorn
269
+ python-multipart==0.0.9
270
+ # via fastapi
271
  pytz==2024.1
272
  # via croniter
273
  # via dagster
274
+ # via dateparser
275
  pyyaml==6.0.1
276
  # via dagster
277
  # via langchain
278
  # via langchain-community
279
  # via langchain-core
280
  # via uvicorn
281
+ regex==2024.7.24
282
+ # via dateparser
283
  # via tiktoken
284
  requests==2.32.3
285
  # via dagster
 
287
  # via gql
288
  # via langchain
289
  # via langchain-community
290
+ # via langchainhub
291
  # via langsmith
292
  # via requests-toolbelt
293
  # via semantic-catalogue
294
+ # via sickle
295
  # via tiktoken
296
  requests-toolbelt==1.0.0
297
  # via gql
298
  rich==13.7.1
299
  # via dagster
300
+ # via typer
301
+ setuptools==72.1.0
302
  # via dagster
303
+ shellingham==1.5.4
304
+ # via typer
305
+ sickle==0.7.0
306
+ # via semantic-catalogue
307
  six==1.16.0
308
  # via asttokens
309
  # via python-dateutil
 
312
  # via httpx
313
  # via openai
314
  sourcery-cli==1.18.0
315
+ sqlalchemy==2.0.31
316
  # via alembic
317
  # via dagster
318
  # via langchain
 
322
  starlette==0.37.2
323
  # via dagster-graphql
324
  # via dagster-webserver
325
+ # via fastapi
326
+ structlog==24.4.0
327
  # via dagster
328
  tabulate==0.9.0
329
  # via dagster
330
+ tenacity==8.5.0
331
  # via langchain
332
  # via langchain-community
333
  # via langchain-core
334
  tiktoken==0.7.0
335
  # via langchain-openai
336
+ time-machine==2.14.2
337
  # via pendulum
338
  tomli==2.0.1
339
  # via dagster
 
346
  traitlets==5.14.3
347
  # via ipython
348
  # via matplotlib-inline
349
+ typer==0.12.3
350
+ # via fastapi-cli
351
+ types-requests==2.32.0.20240712
352
+ # via langchainhub
353
+ typing-extensions==4.12.2
354
  # via alembic
355
  # via dagster
356
+ # via fastapi
357
+ # via langchain-core
358
  # via openai
359
  # via pinecone-client
360
  # via pydantic
361
  # via pydantic-core
362
  # via sqlalchemy
363
+ # via typer
364
  # via typing-inspect
365
  typing-inspect==0.9.0
366
  # via dataclasses-json
367
  tzdata==2024.1
368
  # via pendulum
369
+ tzlocal==5.2
370
+ # via dateparser
371
  universal-pathlib==0.2.2
372
  # via dagster
373
+ urllib3==2.2.2
374
  # via pinecone-client
375
  # via requests
376
+ # via types-requests
377
+ uvicorn==0.30.3
378
  # via dagster-webserver
379
+ # via fastapi
380
+ # via fastapi-cli
381
  uvloop==0.19.0
382
  # via uvicorn
383
  watchdog==4.0.1
requirements.lock CHANGED
@@ -7,14 +7,16 @@
7
  # all-features: false
8
  # with-sources: false
9
  # generate-hashes: false
 
10
 
11
  -e file:.
12
  aiohttp==3.9.5
13
  # via langchain
14
  # via langchain-community
 
15
  aiosignal==1.3.1
16
  # via aiohttp
17
- alembic==1.13.1
18
  # via dagster
19
  aniso8601==9.0.1
20
  # via graphene
@@ -30,46 +32,65 @@ attrs==23.2.0
30
  # via aiohttp
31
  backoff==2.2.1
32
  # via gql
33
- certifi==2024.6.2
34
  # via httpcore
35
  # via httpx
36
  # via pinecone-client
37
  # via requests
 
 
38
  charset-normalizer==3.3.2
 
39
  # via requests
40
  click==8.1.7
41
  # via dagster
42
  # via dagster-webserver
 
43
  # via uvicorn
44
  coloredlogs==14.0
45
  # via dagster
46
- croniter==2.0.5
47
  # via dagster
48
- dagster==1.7.9
 
 
49
  # via dagster-graphql
50
  # via dagster-openai
 
51
  # via dagster-webserver
52
  # via semantic-catalogue
53
- dagster-graphql==1.7.9
54
  # via dagster-webserver
55
- dagster-openai==0.23.9
56
  # via semantic-catalogue
57
- dagster-pipes==1.7.9
58
  # via dagster
59
- dagster-webserver==1.7.9
 
 
60
  # via semantic-catalogue
61
- dataclasses-json==0.6.6
62
  # via langchain-community
 
 
63
  distro==1.9.0
64
  # via openai
 
 
65
  docstring-parser==0.16
66
  # via dagster
67
- filelock==3.14.0
 
 
 
 
 
 
68
  # via dagster
69
  frozenlist==1.4.1
70
  # via aiohttp
71
  # via aiosignal
72
- fsspec==2024.5.0
73
  # via universal-pathlib
74
  gql==3.5.0
75
  # via dagster-graphql
@@ -83,7 +104,7 @@ graphql-relay==3.2.0
83
  # via graphene
84
  greenlet==3.0.3
85
  # via sqlalchemy
86
- grpcio==1.64.0
87
  # via dagster
88
  # via grpcio-health-checking
89
  grpcio-health-checking==1.62.2
@@ -96,41 +117,56 @@ httpcore==1.0.5
96
  httptools==0.6.1
97
  # via uvicorn
98
  httpx==0.27.0
 
99
  # via openai
100
  humanfriendly==10.0
101
  # via coloredlogs
102
  idna==3.7
103
  # via anyio
 
104
  # via httpx
105
  # via requests
106
  # via yarl
107
  jinja2==3.1.4
108
  # via dagster
 
109
  jsonpatch==1.33
110
  # via langchain-core
111
- jsonpointer==2.4
112
  # via jsonpatch
113
- langchain==0.2.1
114
  # via langchain-community
115
  # via semantic-catalogue
116
- langchain-community==0.2.1
 
117
  # via semantic-catalogue
118
- langchain-core==0.2.3
119
  # via langchain
120
  # via langchain-community
 
121
  # via langchain-openai
122
  # via langchain-pinecone
123
  # via langchain-text-splitters
124
- langchain-openai==0.1.8
 
125
  # via semantic-catalogue
126
- langchain-pinecone==0.1.1
127
  # via semantic-catalogue
128
- langchain-text-splitters==0.2.0
 
 
129
  # via langchain
130
- langsmith==0.1.67
 
 
 
 
131
  # via langchain
132
  # via langchain-community
133
  # via langchain-core
 
 
 
134
  mako==1.3.5
135
  # via alembic
136
  markdown-it-py==3.0.0
@@ -138,7 +174,7 @@ markdown-it-py==3.0.0
138
  markupsafe==2.1.5
139
  # via jinja2
140
  # via mako
141
- marshmallow==3.21.2
142
  # via dataclasses-json
143
  mdurl==0.1.2
144
  # via markdown-it-py
@@ -151,53 +187,74 @@ numpy==1.26.4
151
  # via langchain
152
  # via langchain-community
153
  # via langchain-pinecone
154
- openai==1.30.5
155
  # via dagster-openai
156
  # via langchain-openai
157
- orjson==3.10.3
158
  # via langsmith
159
- packaging==23.2
160
  # via dagster
161
  # via langchain-core
 
162
  # via marshmallow
 
 
163
  pendulum==3.0.0
164
  # via dagster
165
- pinecone-client==3.2.2
166
  # via langchain-pinecone
167
- polars==0.20.31
 
 
 
 
 
168
  # via semantic-catalogue
169
- protobuf==4.25.3
170
  # via dagster
171
  # via grpcio-health-checking
172
- pydantic==2.7.2
 
 
 
 
173
  # via dagster
 
174
  # via langchain
175
  # via langchain-core
176
  # via langsmith
177
  # via openai
178
- pydantic-core==2.18.3
 
179
  # via pydantic
 
 
180
  pygments==2.18.0
181
  # via rich
182
  python-dateutil==2.9.0.post0
183
  # via croniter
184
- # via dagster
185
  # via pendulum
186
  # via time-machine
187
  python-dotenv==1.0.1
188
  # via dagster
 
189
  # via semantic-catalogue
190
  # via uvicorn
 
 
191
  pytz==2024.1
192
  # via croniter
193
  # via dagster
 
194
  pyyaml==6.0.1
195
  # via dagster
196
  # via langchain
197
  # via langchain-community
198
  # via langchain-core
199
  # via uvicorn
200
- regex==2024.5.15
 
201
  # via tiktoken
202
  requests==2.32.3
203
  # via dagster
@@ -205,23 +262,30 @@ requests==2.32.3
205
  # via gql
206
  # via langchain
207
  # via langchain-community
 
208
  # via langsmith
209
  # via requests-toolbelt
210
  # via semantic-catalogue
 
211
  # via tiktoken
212
  requests-toolbelt==1.0.0
213
  # via gql
214
  rich==13.7.1
215
  # via dagster
216
- setuptools==70.0.0
 
217
  # via dagster
 
 
 
 
218
  six==1.16.0
219
  # via python-dateutil
220
  sniffio==1.3.1
221
  # via anyio
222
  # via httpx
223
  # via openai
224
- sqlalchemy==2.0.30
225
  # via alembic
226
  # via dagster
227
  # via langchain
@@ -229,17 +293,18 @@ sqlalchemy==2.0.30
229
  starlette==0.37.2
230
  # via dagster-graphql
231
  # via dagster-webserver
232
- structlog==24.2.0
 
233
  # via dagster
234
  tabulate==0.9.0
235
  # via dagster
236
- tenacity==8.3.0
237
  # via langchain
238
  # via langchain-community
239
  # via langchain-core
240
  tiktoken==0.7.0
241
  # via langchain-openai
242
- time-machine==2.14.1
243
  # via pendulum
244
  tomli==2.0.1
245
  # via dagster
@@ -249,26 +314,38 @@ tqdm==4.66.4
249
  # via dagster
250
  # via openai
251
  # via pinecone-client
252
- typing-extensions==4.12.1
 
 
 
 
253
  # via alembic
254
  # via dagster
 
 
255
  # via openai
256
  # via pinecone-client
257
  # via pydantic
258
  # via pydantic-core
259
  # via sqlalchemy
 
260
  # via typing-inspect
261
  typing-inspect==0.9.0
262
  # via dataclasses-json
263
  tzdata==2024.1
264
  # via pendulum
 
 
265
  universal-pathlib==0.2.2
266
  # via dagster
267
- urllib3==2.2.1
268
  # via pinecone-client
269
  # via requests
270
- uvicorn==0.30.1
 
271
  # via dagster-webserver
 
 
272
  uvloop==0.19.0
273
  # via uvicorn
274
  watchdog==4.0.1
 
7
  # all-features: false
8
  # with-sources: false
9
  # generate-hashes: false
10
+ # universal: false
11
 
12
  -e file:.
13
  aiohttp==3.9.5
14
  # via langchain
15
  # via langchain-community
16
+ # via langchain-pinecone
17
  aiosignal==1.3.1
18
  # via aiohttp
19
+ alembic==1.13.2
20
  # via dagster
21
  aniso8601==9.0.1
22
  # via graphene
 
32
  # via aiohttp
33
  backoff==2.2.1
34
  # via gql
35
+ certifi==2024.7.4
36
  # via httpcore
37
  # via httpx
38
  # via pinecone-client
39
  # via requests
40
+ cffi==1.16.0
41
+ # via cryptography
42
  charset-normalizer==3.3.2
43
+ # via pdfminer-six
44
  # via requests
45
  click==8.1.7
46
  # via dagster
47
  # via dagster-webserver
48
+ # via typer
49
  # via uvicorn
50
  coloredlogs==14.0
51
  # via dagster
52
+ croniter==3.0.3
53
  # via dagster
54
+ cryptography==43.0.0
55
+ # via pdfminer-six
56
+ dagster==1.7.15
57
  # via dagster-graphql
58
  # via dagster-openai
59
+ # via dagster-postgres
60
  # via dagster-webserver
61
  # via semantic-catalogue
62
+ dagster-graphql==1.7.15
63
  # via dagster-webserver
64
+ dagster-openai==0.23.15
65
  # via semantic-catalogue
66
+ dagster-pipes==1.7.15
67
  # via dagster
68
+ dagster-postgres==0.23.15
69
+ # via semantic-catalogue
70
+ dagster-webserver==1.7.15
71
  # via semantic-catalogue
72
+ dataclasses-json==0.6.7
73
  # via langchain-community
74
+ dateparser==1.2.0
75
+ # via semantic-catalogue
76
  distro==1.9.0
77
  # via openai
78
+ dnspython==2.6.1
79
+ # via email-validator
80
  docstring-parser==0.16
81
  # via dagster
82
+ email-validator==2.2.0
83
+ # via fastapi
84
+ fastapi==0.112.0
85
+ # via semantic-catalogue
86
+ fastapi-cli==0.0.5
87
+ # via fastapi
88
+ filelock==3.15.4
89
  # via dagster
90
  frozenlist==1.4.1
91
  # via aiohttp
92
  # via aiosignal
93
+ fsspec==2024.6.1
94
  # via universal-pathlib
95
  gql==3.5.0
96
  # via dagster-graphql
 
104
  # via graphene
105
  greenlet==3.0.3
106
  # via sqlalchemy
107
+ grpcio==1.64.1
108
  # via dagster
109
  # via grpcio-health-checking
110
  grpcio-health-checking==1.62.2
 
117
  httptools==0.6.1
118
  # via uvicorn
119
  httpx==0.27.0
120
+ # via fastapi
121
  # via openai
122
  humanfriendly==10.0
123
  # via coloredlogs
124
  idna==3.7
125
  # via anyio
126
+ # via email-validator
127
  # via httpx
128
  # via requests
129
  # via yarl
130
  jinja2==3.1.4
131
  # via dagster
132
+ # via fastapi
133
  jsonpatch==1.33
134
  # via langchain-core
135
+ jsonpointer==3.0.0
136
  # via jsonpatch
137
+ langchain==0.2.12
138
  # via langchain-community
139
  # via semantic-catalogue
140
+ langchain-community==0.2.10
141
+ # via langchain-experimental
142
  # via semantic-catalogue
143
+ langchain-core==0.2.27
144
  # via langchain
145
  # via langchain-community
146
+ # via langchain-experimental
147
  # via langchain-openai
148
  # via langchain-pinecone
149
  # via langchain-text-splitters
150
+ # via langgraph
151
+ langchain-experimental==0.0.63
152
  # via semantic-catalogue
153
+ langchain-openai==0.1.20
154
  # via semantic-catalogue
155
+ langchain-pinecone==0.1.3
156
+ # via semantic-catalogue
157
+ langchain-text-splitters==0.2.2
158
  # via langchain
159
+ langchainhub==0.1.20
160
+ # via semantic-catalogue
161
+ langgraph==0.1.19
162
+ # via semantic-catalogue
163
+ langsmith==0.1.96
164
  # via langchain
165
  # via langchain-community
166
  # via langchain-core
167
+ lxml==5.2.2
168
+ # via semantic-catalogue
169
+ # via sickle
170
  mako==1.3.5
171
  # via alembic
172
  markdown-it-py==3.0.0
 
174
  markupsafe==2.1.5
175
  # via jinja2
176
  # via mako
177
+ marshmallow==3.21.3
178
  # via dataclasses-json
179
  mdurl==0.1.2
180
  # via markdown-it-py
 
187
  # via langchain
188
  # via langchain-community
189
  # via langchain-pinecone
190
+ openai==1.37.1
191
  # via dagster-openai
192
  # via langchain-openai
193
+ orjson==3.10.6
194
  # via langsmith
195
+ packaging==24.1
196
  # via dagster
197
  # via langchain-core
198
+ # via langchainhub
199
  # via marshmallow
200
+ pdfminer-six==20240706
201
+ # via semantic-catalogue
202
  pendulum==3.0.0
203
  # via dagster
204
+ pinecone-client==5.0.1
205
  # via langchain-pinecone
206
+ pinecone-plugin-inference==1.0.3
207
+ # via pinecone-client
208
+ pinecone-plugin-interface==0.0.7
209
+ # via pinecone-client
210
+ # via pinecone-plugin-inference
211
+ polars==1.3.0
212
  # via semantic-catalogue
213
+ protobuf==4.25.4
214
  # via dagster
215
  # via grpcio-health-checking
216
+ psycopg2-binary==2.9.9
217
+ # via dagster-postgres
218
+ pycparser==2.22
219
+ # via cffi
220
+ pydantic==2.8.2
221
  # via dagster
222
+ # via fastapi
223
  # via langchain
224
  # via langchain-core
225
  # via langsmith
226
  # via openai
227
+ # via pydantic-settings
228
+ pydantic-core==2.20.1
229
  # via pydantic
230
+ pydantic-settings==2.3.4
231
+ # via semantic-catalogue
232
  pygments==2.18.0
233
  # via rich
234
  python-dateutil==2.9.0.post0
235
  # via croniter
236
+ # via dateparser
237
  # via pendulum
238
  # via time-machine
239
  python-dotenv==1.0.1
240
  # via dagster
241
+ # via pydantic-settings
242
  # via semantic-catalogue
243
  # via uvicorn
244
+ python-multipart==0.0.9
245
+ # via fastapi
246
  pytz==2024.1
247
  # via croniter
248
  # via dagster
249
+ # via dateparser
250
  pyyaml==6.0.1
251
  # via dagster
252
  # via langchain
253
  # via langchain-community
254
  # via langchain-core
255
  # via uvicorn
256
+ regex==2024.7.24
257
+ # via dateparser
258
  # via tiktoken
259
  requests==2.32.3
260
  # via dagster
 
262
  # via gql
263
  # via langchain
264
  # via langchain-community
265
+ # via langchainhub
266
  # via langsmith
267
  # via requests-toolbelt
268
  # via semantic-catalogue
269
+ # via sickle
270
  # via tiktoken
271
  requests-toolbelt==1.0.0
272
  # via gql
273
  rich==13.7.1
274
  # via dagster
275
+ # via typer
276
+ setuptools==72.1.0
277
  # via dagster
278
+ shellingham==1.5.4
279
+ # via typer
280
+ sickle==0.7.0
281
+ # via semantic-catalogue
282
  six==1.16.0
283
  # via python-dateutil
284
  sniffio==1.3.1
285
  # via anyio
286
  # via httpx
287
  # via openai
288
+ sqlalchemy==2.0.31
289
  # via alembic
290
  # via dagster
291
  # via langchain
 
293
  starlette==0.37.2
294
  # via dagster-graphql
295
  # via dagster-webserver
296
+ # via fastapi
297
+ structlog==24.4.0
298
  # via dagster
299
  tabulate==0.9.0
300
  # via dagster
301
+ tenacity==8.5.0
302
  # via langchain
303
  # via langchain-community
304
  # via langchain-core
305
  tiktoken==0.7.0
306
  # via langchain-openai
307
+ time-machine==2.14.2
308
  # via pendulum
309
  tomli==2.0.1
310
  # via dagster
 
314
  # via dagster
315
  # via openai
316
  # via pinecone-client
317
+ typer==0.12.3
318
+ # via fastapi-cli
319
+ types-requests==2.32.0.20240712
320
+ # via langchainhub
321
+ typing-extensions==4.12.2
322
  # via alembic
323
  # via dagster
324
+ # via fastapi
325
+ # via langchain-core
326
  # via openai
327
  # via pinecone-client
328
  # via pydantic
329
  # via pydantic-core
330
  # via sqlalchemy
331
+ # via typer
332
  # via typing-inspect
333
  typing-inspect==0.9.0
334
  # via dataclasses-json
335
  tzdata==2024.1
336
  # via pendulum
337
+ tzlocal==5.2
338
+ # via dateparser
339
  universal-pathlib==0.2.2
340
  # via dagster
341
+ urllib3==2.2.2
342
  # via pinecone-client
343
  # via requests
344
+ # via types-requests
345
+ uvicorn==0.30.3
346
  # via dagster-webserver
347
+ # via fastapi
348
+ # via fastapi-cli
349
  uvloop==0.19.0
350
  # via uvicorn
351
  watchdog==4.0.1
src/__init__.py CHANGED
@@ -1,14 +0,0 @@
- from dagster import Definitions, load_assets_from_modules
-
- from src.assets import adr, datastore
- from src.jobs import adr_job
- from src.resources import openai_resource
-
- adr_assets = load_assets_from_modules(modules=[adr], group_name="adr_assets")
- datastore_assets = load_assets_from_modules(modules=[datastore], group_name="datastore")
-
- defs = Definitions(
-     assets=[*adr_assets, *datastore_assets],
-     jobs=[adr_job],
-     resources={"openai": openai_resource},
- )
src/assets/datastore.py DELETED
@@ -1,48 +0,0 @@
- from dagster import AssetExecutionContext, asset
- from dagster_openai import OpenAIResource
- from dotenv import load_dotenv
- from langchain_community.document_loaders import DirectoryLoader, TextLoader
- from langchain_openai import OpenAIEmbeddings
- from langchain_pinecone import PineconeVectorStore
- from langchain_text_splitters import CharacterTextSplitter
- from pinecone import Pinecone, ServerlessSpec
-
- from src.common.utils import Consts, Paths
-
- load_dotenv()
-
-
- @asset(compute_kind="Pinecone")
- def pinecone_index(context: AssetExecutionContext):
-     pc = Pinecone()
-     if Consts.INDEX_NAME in [index["name"] for index in pc.list_indexes()]:
-         pc.delete_index(Consts.INDEX_NAME)
-
-     pc.create_index(
-         name=Consts.INDEX_NAME,
-         dimension=Consts.EMBEDDING_DIM,
-         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
-         metric="cosine",
-     )
-
-
- @asset(compute_kind="OpenAI", deps=["adr_descriptions", "pinecone_index"])
- def adr_pinecone(context: AssetExecutionContext, openai: OpenAIResource):
-     loader = DirectoryLoader(
-         Paths.ADR / "descriptions",
-         glob="*.txt",
-         loader_cls=TextLoader,
-         use_multithreading=True,
-         show_progress=True,
-     )
-     documents = loader.load()
-     text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
-     docs = text_splitter.split_documents(documents)
-
-     with openai.get_client(context) as client:
-         embeddings = OpenAIEmbeddings(
-             client=client.embeddings,
-             model=Consts.EMBEDDING_MODEL,
-         )
-
-     PineconeVectorStore.from_documents(docs, embeddings, index_name=Consts.INDEX_NAME)
src/common/logging.py ADDED
@@ -0,0 +1,3 @@
+ import logging.config
+
+ logger = logging.getLogger("data-catalogue")
src/common/settings.py ADDED
@@ -0,0 +1,28 @@
+ import tomllib
+
+ from pydantic import Field
+ from pydantic_settings import BaseSettings
+
+ with open("./config/config.toml", "rb") as f:
+     Config = tomllib.load(f)
+
+
+ class DataStoreSettings(BaseSettings):
+     index_name: str = Field(min_length=1)
+     embed_model: str = Field(min_length=1)
+     embed_dim: int = Field(gt=0, le=10_000)
+     chunk_size: int = Field(gt=0, le=10_000)
+     chunk_overlap: int = Field(ge=0, le=10_000)
+
+
+ class ModelSettings(BaseSettings):
+     llm: str = Field(min_length=1)
+     top_k: int = Field(gt=0, le=100)
+
+
+ class Settings(BaseSettings):
+     model: ModelSettings = ModelSettings.model_validate(Config["model"])
+     datastore: DataStoreSettings = DataStoreSettings.model_validate(Config["datastore"])
+
+
+ cfg = Settings()
src/common/utils.py CHANGED
@@ -1,12 +1,21 @@
 from pathlib import Path
 
+from dotenv import load_dotenv
+
+load_dotenv()
+
 
 class Paths:
-    DATA = Path("data")
+    DATA: Path = Path("data")
     ADR = DATA / "adr"
+    UKDS = DATA / "ukds"
+    CDRC = DATA / "cdrc"
+
+    @classmethod
+    def ensure_directories_exist(cls):
+        cls.ADR.mkdir(parents=True, exist_ok=True)
+        cls.UKDS.mkdir(parents=True, exist_ok=True)
+        cls.CDRC.mkdir(parents=True, exist_ok=True)
 
 
-class Consts:
-    INDEX_NAME = "data-catalogue"
-    EMBEDDING_MODEL = "text-embedding-3-large"
-    EMBEDDING_DIM = 3072
+Paths.ensure_directories_exist()
src/datastore/__init__.py ADDED
@@ -0,0 +1,18 @@
+ from dagster import Definitions, load_assets_from_modules
+
+ from src.datastore.assets import adr, cdrc, datastore, ukds
+ from src.datastore.jobs import adr_job, cdrc_job, ukds_job
+ from src.datastore.resources import openai_resource
+ from src.datastore.schedules import adr_schedule, cdrc_schedule, ukds_schedule
+
+ adr_assets = load_assets_from_modules(modules=[adr], group_name="adr_assets")
+ ukds_assets = load_assets_from_modules(modules=[ukds], group_name="ukds_assets")
+ cdrc_assets = load_assets_from_modules(modules=[cdrc], group_name="cdrc_assets")
+ datastore_assets = load_assets_from_modules(modules=[datastore], group_name="datastore")
+
+ defs = Definitions(
+     assets=[*ukds_assets, *adr_assets, *cdrc_assets, *datastore_assets],
+     jobs=[adr_job, ukds_job, cdrc_job],
+     schedules=[adr_schedule, ukds_schedule, cdrc_schedule],
+     resources={"openai": openai_resource},
+ )
src/{assets → datastore/assets}/adr.py RENAMED
@@ -1,10 +1,10 @@
1
  import itertools
2
  import json
3
- import logging
4
 
5
  import polars as pl
6
  import requests
7
  from dagster import AssetExecutionContext, asset
 
8
  from tqdm import tqdm
9
 
10
  from src.common.utils import Paths
@@ -15,7 +15,7 @@ BASE_URL = "https://api-datacatalogue.adruk.org/api"
15
 
16
 
17
  @asset
18
- def adr_session():
19
  session = requests.Session()
20
  session.headers.update({"X-API-Version": API_VERSION})
21
  return session
@@ -28,16 +28,26 @@ def adr_datasets_id(
28
  datasets = []
29
  for page_number in itertools.count(start=1):
30
  context.log.info(f"Fetching page {page_number}")
31
- datasets_page = _fetch_datasets_page(adr_session, page_number)
32
- if not datasets_page:
 
33
  break
34
  datasets.extend(datasets_page)
35
- df = pl.DataFrame(datasets)
 
 
 
 
 
36
  df.write_parquet(Paths.ADR / "adr_datasets_id.parquet")
 
37
  return df
38
 
39
 
40
- def _fetch_datasets_page(adr_session: requests.Session, page_number: int) -> dict:
 
 
 
41
  params = {
42
  "pageSize": PAGE_SIZE,
43
  "pageNumber": page_number,
@@ -50,11 +60,16 @@ def _fetch_datasets_page(adr_session: requests.Session, page_number: int) -> dic
50
  try:
51
  response = adr_session.get(f"{BASE_URL}/{{sql}}/dataset", params=params)
52
  response.raise_for_status()
53
- return json.loads(response.content)["content"]
 
 
 
54
  except requests.HTTPError as http_err:
55
- logging.error(f"HTTP error occurred: {http_err}")
 
56
  except Exception as err:
57
- logging.error(f"Other error occurred: {err}")
 
58
 
59
 
60
  @asset
@@ -63,19 +78,26 @@ def adr_datasets(
63
  adr_session: requests.Session,
64
  adr_datasets_id: pl.DataFrame,
65
  ) -> pl.DataFrame:
66
- df = adr_datasets_id.filter(pl.col("searchResultType") == "PHYSICAL").with_columns(
67
- pl.col("origin").struct[0].alias("origin_id")
68
- )
69
 
70
  datasets_list = []
71
- for row in tqdm(df.rows(named=True), total=len(df)):
72
  dataset = _fetch_dataset_info(context, adr_session, row)
73
- datasets_list.append(dataset)
 
74
  df = pl.DataFrame(datasets_list)
75
- df.write_parquet(Paths.ADR / "adr_datasets.parquet")
 
 
 
 
 
 
 
 
76
  return df
77
 
78
 
 
79
  def _fetch_dataset_info(
80
  context: AssetExecutionContext, adr_session: requests.Session, row: dict
81
  ) -> dict:
@@ -85,8 +107,10 @@ def _fetch_dataset_info(
85
  response.raise_for_status()
86
  except requests.HTTPError as http_err:
87
  context.log.error(f"HTTP error occurred: {http_err}")
 
88
  except Exception as err:
89
  context.log.error(f"Other error occurred: {err}")
 
90
  content = json.loads(response.content)
91
 
92
  return {
@@ -105,10 +129,12 @@ def _fetch_dataset_info(
105
 
106
  @asset
107
  def adr_descriptions(adr_datasets: pl.DataFrame) -> None:
 
 
 
108
  for item in adr_datasets.rows(named=True):
109
  with open(
110
- Paths.ADR
111
- / f"descriptions/{item['id']}-{item['origin_id']}-description.txt",
112
  "w",
113
  ) as f:
114
  f.write(
 
1
  import itertools
2
  import json
 
3
 
4
  import polars as pl
5
  import requests
6
  from dagster import AssetExecutionContext, asset
7
+ from tenacity import retry, stop_after_attempt, wait_exponential
8
  from tqdm import tqdm
9
 
10
  from src.common.utils import Paths
 
15
 
16
 
17
  @asset
18
+ def adr_session() -> requests.Session:
19
  session = requests.Session()
20
  session.headers.update({"X-API-Version": API_VERSION})
21
  return session
 
28
  datasets = []
29
  for page_number in itertools.count(start=1):
30
  context.log.info(f"Fetching page {page_number}")
31
+ datasets_page = _fetch_datasets_page(context, adr_session, page_number)
32
+ if "end" in datasets_page:
33
+ context.log.info(f"End of pages reached at {datasets_page['end']}")
34
  break
35
  datasets.extend(datasets_page)
36
+ df = (
37
+ pl.DataFrame(datasets)
38
+ .select(["origin", "id", "searchResultType", "title"])
39
+ .filter(pl.col("searchResultType") == "PHYSICAL")
40
+ .with_columns(pl.col("origin").struct[0].alias("origin_id"))
41
+ )
42
  df.write_parquet(Paths.ADR / "adr_datasets_id.parquet")
43
+
44
  return df
45
 
46
 
47
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
48
+ def _fetch_datasets_page(
49
+ context: AssetExecutionContext, adr_session: requests.Session, page_number: int
50
+ ) -> dict:
51
  params = {
52
  "pageSize": PAGE_SIZE,
53
  "pageNumber": page_number,
 
60
  try:
61
  response = adr_session.get(f"{BASE_URL}/{{sql}}/dataset", params=params)
62
  response.raise_for_status()
63
+ content = json.loads(response.content)["content"]
64
+ if not content:
65
+ return {"end": page_number}
66
+ return content
67
  except requests.HTTPError as http_err:
68
+ context.log.error(f"HTTP error occurred: {http_err}")
69
+ return {}
70
  except Exception as err:
71
+ context.log.error(f"Other error occurred: {err}")
72
+ return {}
73
 
74
 
75
  @asset
 
78
  adr_session: requests.Session,
79
  adr_datasets_id: pl.DataFrame,
80
  ) -> pl.DataFrame:
 
 
 
81
 
82
  datasets_list = []
83
+ for row in tqdm(adr_datasets_id.rows(named=True), total=len(adr_datasets_id)):
84
  dataset = _fetch_dataset_info(context, adr_session, row)
85
+ if dataset:
86
+ datasets_list.append(dataset)
87
  df = pl.DataFrame(datasets_list)
88
+ (
89
+ df.with_columns(
90
+ pl.col("coverage").struct[0].alias("coverage_0"),
91
+ pl.col("coverage").struct[1].alias("coverage_1"),
92
+ )
93
+ .drop("coverage")
94
+ .write_parquet(Paths.ADR / "adr_datasets.parquet")
95
+ )
96
+
97
  return df
98
 
99
 
100
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
101
  def _fetch_dataset_info(
102
  context: AssetExecutionContext, adr_session: requests.Session, row: dict
103
  ) -> dict:
 
107
  response.raise_for_status()
108
  except requests.HTTPError as http_err:
109
  context.log.error(f"HTTP error occurred: {http_err}")
110
+ return {}
111
  except Exception as err:
112
  context.log.error(f"Other error occurred: {err}")
113
+ return {}
114
  content = json.loads(response.content)
115
 
116
  return {
 
129
 
130
  @asset
131
  def adr_descriptions(adr_datasets: pl.DataFrame) -> None:
132
+ outdir = Paths.ADR / "txt"
133
+ outdir.mkdir(parents=True, exist_ok=True)
134
+
135
  for item in adr_datasets.rows(named=True):
136
  with open(
137
+ outdir / f"{item['id']}-{item['origin_id']}-description.txt",
 
138
  "w",
139
  ) as f:
140
  f.write(
src/datastore/assets/cdrc.py ADDED
@@ -0,0 +1,113 @@
+ import itertools
+ import os
+ import re
+
+ import polars as pl
+ import requests
+ from dagster import AssetExecutionContext, asset
+ from dotenv import load_dotenv
+ from tqdm import tqdm
+
+ from src.common.utils import Paths
+
+ load_dotenv()
+
+ METADATA_URL = (
+     "https://data.cdrc.ac.uk/api/3/action/current_package_list_with_resources"
+ )
+ LOGIN_URL = "https://data.cdrc.ac.uk/user/login"
+
+
+ @asset
+ def cdrc_metadata(context: AssetExecutionContext) -> list[dict]:
+     try:
+         r = requests.get(METADATA_URL)
+         r.raise_for_status()
+     except requests.HTTPError as http_err:
+         context.log.error(f"HTTP error occurred: {http_err}")
+         raise
+     except Exception as err:
+         context.log.error(f"Other error occurred: {err}")
+         raise
+     catalogue_metadata = r.json()["result"][0]
+     return catalogue_metadata
+
+
+ @asset
+ def cdrc_notes(cdrc_metadata: list[dict]):
+     outdir = Paths.CDRC / "txt"
+     outdir.mkdir(parents=True, exist_ok=True)
+
+     df = pl.DataFrame(cdrc_metadata).drop(["resources", "tags", "extras"])
+     df.write_parquet(Paths.CDRC / "cdrc_metadata.parquet")
+
+     for item in df.rows(named=True):
+         with open(Paths.CDRC / "txt" / f"{item['id']}-notes.txt", "w") as f:
+             f.write(
+                 f"Dataset Title: {item['title']}"
+                 "\n\nDescription: \n\n"
+                 f"{re.sub('<[^<]+?>','', item['notes'])}"
+             )
+
+
+ @asset
+ def cdrc_resources(cdrc_metadata: list[dict]) -> pl.DataFrame:
+     resources = list(
+         itertools.chain.from_iterable(
+             [item["resources"] if "resources" in item else [] for item in cdrc_metadata]
+         )
+     )
+     df = pl.concat(
+         [
+             pl.DataFrame(cdrc_metadata).explode("resources").drop("resources"),
+             pl.DataFrame(resources)
+             .rename(
+                 {"id": "resource_id", "url": "resource_url", "name": "resource_name"},
+             )
+             .drop(["state", "revision_timestamp"]),
+         ],
+         how="horizontal",
+     ).filter((pl.col("format") == "pdf") & (pl.col("resource_url") != ""))
+     df.write_parquet(Paths.CDRC / "cdrc_resource_metadata.parquet")
+     return df
+
+
+ @asset
+ def cdrc_session() -> requests.Session:
+     session = requests.Session()
+     session.post(
+         LOGIN_URL,
+         data={
+             "name": os.getenv("CDRC_USERNAME"),
+             "pass": os.getenv("CDRC_PASSWORD"),
+             "form_build_id": os.getenv("CDRC_FORM_BUILD_ID"),
+             "form_id": "user_login",
+             "op": "Log in",
+         },
+     )
+     return session
+
+
+ @asset
+ def cdrc_pdfs(
+     context: AssetExecutionContext,
+     cdrc_session: requests.Session,
+     cdrc_resources: pl.DataFrame,
+ ):
+     outdir = Paths.CDRC / "pdf"
+     outdir.mkdir(parents=True, exist_ok=True)
+
+     for item in tqdm(cdrc_resources.rows(named=True)):
+         context.log.info(f"Processing {item['resource_url']}...")
+         filepath = outdir / f"{item['id']}-{item['resource_id']}.pdf"
+         try:
+             file = cdrc_session.get(item["resource_url"])
+             file.raise_for_status()
+         except requests.HTTPError as http_err:
+             context.log.error(f"HTTP error occurred: {http_err}")
+             continue
+         except Exception as err:
+             context.log.error(f"Other error occurred: {err}")
+             continue
+         with open(filepath, "wb") as f:
+             f.write(file.content)
src/datastore/assets/datastore.py ADDED
@@ -0,0 +1,99 @@
+ import time
+ from pathlib import Path
+
+ from dagster import (
+     AssetExecutionContext,
+     AutoMaterializePolicy,
+     AutoMaterializeRule,
+     asset,
+ )
+ from dagster_openai import OpenAIResource
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_pinecone import PineconeVectorStore
+ from pinecone import Pinecone, ServerlessSpec
+
+ from src.common.settings import cfg
+ from src.common.utils import Paths
+ from src.datastore.loaders import ADRLoader, CDRCLoader, UKDSLoader
+
+ wait_on_all_parents_policy = AutoMaterializePolicy.eager().with_rules(
+     AutoMaterializeRule.skip_on_not_all_parents_updated()
+ )
+
+
+ def _process_documents(
+     context: AssetExecutionContext,
+     openai: OpenAIResource,
+     paths: list[Path],
+     glob_patterns: list[str],
+     loader_classes: list[type],
+ ):
+     documents = []
+     for path, glob_pattern, loader_cls in zip(paths, glob_patterns, loader_classes):
+         loader = DirectoryLoader(
+             str(path),
+             glob=glob_pattern,
+             loader_cls=loader_cls,
+             use_multithreading=True,
+             show_progress=True,
+         )
+         documents.extend(loader.load())
+
+     with openai.get_client(context) as client:
+         embeddings = OpenAIEmbeddings(
+             client=client.embeddings, model=cfg.datastore.embed_model
+         )
+
+     text_splitter = SemanticChunker(
+         embeddings=embeddings, breakpoint_threshold_type="percentile"
+     )
+     docs = text_splitter.split_documents(documents)
+
+     vectorstore = PineconeVectorStore(
+         index_name=cfg.datastore.index_name, embedding=embeddings
+     )
+     vectorstore.add_documents(documents=docs)
+
+
+ @asset(
+     compute_kind="Pinecone",
+     deps=["adr_descriptions", "ukds_abstracts", "cdrc_notes", "cdrc_pdfs"],
+     auto_materialize_policy=wait_on_all_parents_policy,
+ )
+ def pinecone_index(context: AssetExecutionContext, openai: OpenAIResource):
+     pc = Pinecone()
+     if cfg.datastore.index_name in [index["name"] for index in pc.list_indexes()]:
+         pc.delete_index(cfg.datastore.index_name)
+
+     pc.create_index(
+         name=cfg.datastore.index_name,
+         dimension=cfg.datastore.embed_dim,
+         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+         metric="cosine",
+     )
+     while not pc.describe_index(cfg.datastore.index_name).status["ready"]:
+         time.sleep(1)
+
+     _process_documents(
+         context,
+         openai,
+         paths=[Paths.ADR / "txt"],
+         glob_patterns=["*.txt"],
+         loader_classes=[ADRLoader],
+     )
+     _process_documents(
+         context,
+         openai,
+         paths=[Paths.CDRC / "txt", Paths.CDRC / "pdf"],
+         glob_patterns=["*.txt", "*.pdf"],
+         loader_classes=[CDRCLoader, CDRCLoader],
+     )
+     _process_documents(
+         context,
+         openai,
+         paths=[Paths.UKDS / "txt"],
+         loader_classes=[UKDSLoader],
+         glob_patterns=["*.txt"],
+     )
src/datastore/assets/ukds.py ADDED
@@ -0,0 +1,122 @@
1
+ import re
2
+ import xml.etree.ElementTree as ET
3
+
4
+ import polars as pl
5
+ import requests
6
+ from dagster import AssetExecutionContext, asset
7
+ from tenacity import retry, stop_after_attempt, wait_exponential
8
+ from tqdm import tqdm
9
+
10
+ from src.common.utils import Paths
11
+
12
+ BASE_URL = "https://oai.ukdataservice.ac.uk:8443/oai/provider"
13
+ PARAMS = {"verb": "ListIdentifiers", "metadataPrefix": "ddi", "set": "DataCollections"}
14
+ NAMESPACES = {"oai": "http://www.openarchives.org/OAI/2.0/", "ns2": "ddi:codebook:2_5"}
15
+
16
+
17
+ @asset
18
+ def ukds_identifiers() -> list[str]:
19
+ params = PARAMS.copy()
20
+
21
+ identifiers = []
22
+ while True:
23
+ response = requests.get(BASE_URL, params=params)
24
+ if response.status_code != 200:
25
+ print(f"Failed to fetch data. Status code: {response.status_code}")
26
+ break
27
+
28
+ root = ET.fromstring(response.content)
29
+
30
+ headers = root.findall(".//oai:header", NAMESPACES)
31
+ for header in headers:
32
+ if header.attrib.get("status") != "deleted":
33
+ identifier = header.find(".//oai:identifier", NAMESPACES)
34
+ if identifier is not None:
35
+ identifiers.append(identifier.text)
36
+
37
+ token_elem = root.find(".//oai:resumptionToken", NAMESPACES)
38
+ if token_elem is None or token_elem.text is None:
39
+ break
40
+
41
+ params = {"verb": "ListIdentifiers", "resumptionToken": token_elem.text}
42
+ return identifiers
43
+
44
+
45
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
46
+ def _fetch_metadata(context: AssetExecutionContext, identifier: str):
47
+ metadata_url = (
48
+ f"{BASE_URL}?verb=GetRecord&identifier={identifier}&metadataPrefix=ddi"
49
+ )
50
+ try:
51
+ response = requests.get(metadata_url)
52
+ response.raise_for_status()
53
+ except requests.HTTPError as http_err:
54
+ context.log.error(f"HTTP error occurred: {http_err}")
55
+ raise
56
+ except Exception as err:
57
+ context.log.error(f"Other error occurred: {err}")
58
+ raise
59
+
60
+ root = ET.fromstring(response.content)
61
+ return root
62
+
63
+
64
+ @asset
65
+ def ukds_datasets(
66
+ context: AssetExecutionContext, ukds_identifiers: list[str]
67
+ ) -> pl.DataFrame:
68
+ data = []
69
+ for identifier in tqdm(ukds_identifiers):
70
+ context.log.info(f"Fetching identifier {identifier}")
71
+ metadata = _fetch_metadata(context, identifier).find(
72
+ ".//ns2:stdyDscr", NAMESPACES
73
+ )
74
+
75
+ if metadata is None:
76
+ continue
77
+
78
+ abstract = "\n".join(
79
+ [
80
+ re.sub("<[^<]+?>", "", m.text)
81
+ for m in metadata.findall(".//ns2:abstract", NAMESPACES)
82
+ if m.text is not None
83
+ ]
84
+ )
85
+ date = metadata.find(".//ns2:depDate", NAMESPACES)
86
+ title = metadata.find(".//ns2:titl", NAMESPACES)
87
+ keywords = [
88
+ m.text
89
+ for m in metadata.findall(".//ns2:keyword", NAMESPACES)
90
+ if m.text is not None
91
+ ]
92
+ doi = metadata.find(".//ns2:holdings", NAMESPACES)
93
+ url = f"https://beta.ukdataservice.ac.uk/datacatalogue/studies/study?id={identifier}"
94
+ data.append(
95
+ {
96
+ "title": title.text if title is not None else None,
97
+ "abstract": abstract,
98
+ "date": date.text if date is not None else None,
99
+ "keywords": keywords,
100
+ "doi": doi.get("URI") if doi is not None else None,
101
+ "url": url,
102
+ }
103
+ )
104
+
105
+ df = pl.DataFrame(data)
106
+ df.write_parquet(Paths.UKDS / "ukds.parquet")
107
+ return df
108
+
109
+
110
+ @asset
111
+ def ukds_abstracts(ukds_datasets: pl.DataFrame):
112
+ outdir = Paths.UKDS / "txt"
113
+ outdir.mkdir(parents=True, exist_ok=True)
114
+
115
+ for row in ukds_datasets.rows(named=True):
116
+ id = row["url"].split("=")[-1]
117
+ abstract = row["abstract"].replace(
118
+ "Abstract copyright UK Data Service and data collection copyright owner.",
119
+ "",
120
+ )
121
+ with open(outdir / f"{id}-abstract.txt", "w") as f:
122
+ f.write(f"Dataset Title: {row['title']}" f"\n\nDescription: \n\n{abstract}")
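An illustrative sketch (not in the commit) of how the OAI-PMH namespace mapping above is applied when parsing a ListIdentifiers response. The XML here is a hand-written sample, not real UKDS output; deleted records are skipped and the resumptionToken drives paging, exactly as in ukds_identifiers.

# not in the commit: toy ListIdentifiers response parsed with the same NAMESPACES mapping
import xml.etree.ElementTree as ET

NAMESPACES = {"oai": "http://www.openarchives.org/OAI/2.0/", "ns2": "ddi:codebook:2_5"}

sample = """
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListIdentifiers>
    <header><identifier>8128</identifier></header>
    <header status="deleted"><identifier>1234</identifier></header>
    <resumptionToken>page-2-token</resumptionToken>
  </ListIdentifiers>
</OAI-PMH>
"""

root = ET.fromstring(sample)
ids = [
    h.find(".//oai:identifier", NAMESPACES).text
    for h in root.findall(".//oai:header", NAMESPACES)
    if h.attrib.get("status") != "deleted"
]
token = root.find(".//oai:resumptionToken", NAMESPACES)
print(ids)         # ['8128']
print(token.text)  # 'page-2-token'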
src/{jobs.py → datastore/jobs.py} RENAMED
@@ -4,4 +4,17 @@ adr_job = define_asset_job(
4
  "adr",
5
  selection=["adr_session", "adr_datasets_id", "adr_datasets", "adr_descriptions"],
6
  )
7
- pinecone_job = define_asset_job("pinecone", selection=["adr_pinecone"])
7
+ ukds_job = define_asset_job(
8
+ "ukds",
9
+ selection=["ukds_identifiers", "ukds_datasets", "ukds_abstracts"],
10
+ )
11
+ cdrc_job = define_asset_job(
12
+ "cdrc",
13
+ selection=[
14
+ "cdrc_session",
15
+ "cdrc_metadata",
16
+ "cdrc_notes",
17
+ "cdrc_resources",
18
+ "cdrc_pdfs",
19
+ ],
20
+ )
src/datastore/loaders.py ADDED
@@ -0,0 +1,154 @@
1
+ from pathlib import Path
2
+ from typing import Iterator
3
+
4
+ import dateparser
5
+ import polars as pl
6
+ from langchain_community.document_loaders import PDFMinerLoader
7
+ from langchain_core.document_loaders import BaseLoader
8
+ from langchain_core.documents import Document
9
+
10
+ from src.common.utils import Paths
11
+
12
+
13
+ class CDRCLoader(BaseLoader):
14
+ def __init__(self, file_path: str) -> None:
15
+ self.file_path = file_path
16
+
17
+ def lazy_load(self) -> Iterator[Document]:
18
+ if self.file_path.endswith(".pdf"):
19
+ document = PDFMinerLoader(self.file_path).load()
20
+ metadata = self._add_cdrc_pdf_metadata(self.file_path)
21
+ document[0].metadata |= metadata
22
+ yield document[0]
23
+ elif self.file_path.endswith(".txt"):
24
+ with open(self.file_path, encoding="utf-8") as f:
25
+ yield Document(
26
+ page_content=f.read(),
27
+ metadata={"source": self.file_path}
28
+ | self._add_cdrc_txt_metadata(self.file_path),
29
+ )
30
+
31
+ @staticmethod
32
+ def _add_cdrc_txt_metadata(file_path: str) -> dict[str, str]:
33
+ id = Path(file_path).stem.rsplit("-", maxsplit=1)[0]
34
+ cdrc_meta = pl.read_parquet(Paths.CDRC / "cdrc_metadata.parquet")
35
+
36
+ metadata = cdrc_meta.filter(pl.col("id") == id)
37
+ iso_date = dateparser.parse(metadata["metadata_created"][0]).isoformat() # type: ignore
38
+ return {
39
+ "title": metadata["title"][0],
40
+ "id": metadata["id"][0],
41
+ "url": metadata["url"][0],
42
+ "date_created": iso_date,
43
+ "source": "CDRC",
44
+ }
45
+
46
+ @staticmethod
47
+ def _add_cdrc_pdf_metadata(file_path: str) -> dict[str, str]:
48
+ id = Path(file_path).stem
49
+ main_id = "-".join(id.split("-")[:5])
50
+ resource_id = "-".join(id.split("-")[5:])
51
+
52
+ cdrc_meta = pl.read_parquet(Paths.CDRC / "cdrc_metadata.parquet")
53
+ cdrc_pdf_meta = pl.read_parquet(Paths.CDRC / "cdrc_resource_metadata.parquet")
54
+
55
+ resource = cdrc_pdf_meta.filter(pl.col("resource_id") == resource_id)
56
+ metadata = cdrc_meta.filter(pl.col("id") == main_id)
57
+
58
+ iso_date = dateparser.parse(resource["created"][0]).isoformat() # type: ignore
59
+ return {
60
+ "title": metadata["title"][0],
61
+ "id": metadata["id"][0],
62
+ "url": metadata["url"][0],
63
+ "date_created": iso_date,
64
+ "source": "CDRC",
65
+ }
66
+
67
+
68
+ class ADRLoader(BaseLoader):
69
+ def __init__(self, file_path: str) -> None:
70
+ self.file_path = file_path
71
+
72
+ def lazy_load(self) -> Iterator[Document]:
73
+ with open(self.file_path, encoding="utf-8") as f:
74
+ yield Document(
75
+ page_content=f.read(),
76
+ metadata={"source": self.file_path}
77
+ | self._add_adr_metadata(self.file_path),
78
+ )
79
+
80
+ @staticmethod
81
+ def _add_adr_metadata(file_path: str) -> dict[str, str]:
82
+ doc_id, origin_id, _ = Path(file_path).stem.split("-")
83
+ metadata = (
84
+ pl.scan_parquet(Paths.ADR / "adr_datasets.parquet")
85
+ .filter((pl.col("id") == doc_id) & (pl.col("origin_id") == origin_id))
86
+ .collect()
87
+ .to_dict(as_series=False)
88
+ )
89
+ if len(metadata["id"]) == 0:
90
+ return {
91
+ "title": "",
92
+ "id": f"{doc_id}-{origin_id}",
93
+ "url": "",
94
+ "date_created": "",
95
+ "source": "ADR",
96
+ }
97
+
98
+ date_created = metadata["coverage_1"][0]["distributionReleaseDate"]
99
+ date_created = (
100
+ dateparser.parse(date_created).isoformat() # type: ignore
101
+ if isinstance(date_created, str)
102
+ else ""
103
+ )
104
+ return {
105
+ "title": metadata["name"][0],
106
+ "id": f"{doc_id}-{origin_id}",
107
+ "url": metadata["url"][0],
108
+ "date_created": date_created,
109
+ "source": "ADR",
110
+ }
111
+
112
+
113
+ class UKDSLoader(BaseLoader):
114
+ def __init__(self, file_path: str) -> None:
115
+ self.file_path = file_path
116
+
117
+ def lazy_load(self) -> Iterator[Document]:
118
+ with open(self.file_path, encoding="utf-8") as f:
119
+ yield Document(
120
+ page_content=f.read(),
121
+ metadata={"source": self.file_path}
122
+ | self._add_ukds_metadata(self.file_path),
123
+ )
124
+
125
+ @staticmethod
126
+ def _add_ukds_metadata(file_path: str) -> dict[str, str]:
127
+ doc_id = Path(file_path).stem.split("-")[0]
128
+ metadata = (
129
+ pl.scan_parquet(Paths.UKDS / "ukds.parquet")
130
+ .with_columns(pl.col("url").str.split("=").list[1].alias("id"))
131
+ .filter(pl.col("id") == doc_id)
132
+ .collect()
133
+ .to_dict(as_series=False)
134
+ )
135
+ if len(metadata["id"]) == 0:
136
+ return {
137
+ "title": "",
138
+ "id": doc_id,
139
+ "url": "",
140
+ "date_created": "",
141
+ "source": "UKDS",
142
+ }
143
+ date_created = (
144
+ dateparser.parse(metadata["date"][0]).isoformat() # type: ignore
145
+ if isinstance(metadata["date"][0], str)
146
+ else ""
147
+ )
148
+ return {
149
+ "title": metadata["title"][0],
150
+ "id": doc_id,
151
+ "url": metadata["url"][0],
152
+ "date_created": date_created,
153
+ "source": "UKDS",
154
+ }
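A usage sketch (not in the commit): the loaders are passed to DirectoryLoader by the pinecone_index asset, but they can also be exercised one file at a time, which helps when debugging the metadata joins. It assumes the txt files and cdrc_metadata.parquet produced by the CDRC assets already exist under the configured paths.

# not in the commit: running a loader directly and via DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader

from src.common.utils import Paths
from src.datastore.loaders import CDRCLoader

# single file
first_txt = next((Paths.CDRC / "txt").glob("*.txt"))
doc = next(CDRCLoader(str(first_txt)).lazy_load())
print(doc.metadata["title"], doc.metadata["date_created"])

# whole directory, as the pinecone_index asset does
docs = DirectoryLoader(str(Paths.CDRC / "txt"), glob="*.txt", loader_cls=CDRCLoader).load()
print(len(docs))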
src/{resources.py → datastore/resources.py} RENAMED
File without changes
src/datastore/schedules.py ADDED
@@ -0,0 +1,22 @@
1
+ from dagster import DefaultScheduleStatus, ScheduleDefinition
2
+
3
+ from src.datastore.jobs import adr_job, cdrc_job, ukds_job
4
+
5
+ adr_schedule = ScheduleDefinition(
6
+ job=adr_job,
7
+ cron_schedule="0 0 * * 0",
8
+ name="adr_weekly_schedule",
9
+ default_status=DefaultScheduleStatus.RUNNING,
10
+ )
11
+ cdrc_schedule = ScheduleDefinition(
12
+ job=cdrc_job,
13
+ cron_schedule="0 0 * * 0",
14
+ name="cdrc_weekly_schedule",
15
+ default_status=DefaultScheduleStatus.RUNNING,
16
+ )
17
+ ukds_schedule = ScheduleDefinition(
18
+ job=ukds_job,
19
+ cron_schedule="0 0 * * 0",
20
+ name="ukds_weekly_schedule",
21
+ default_status=DefaultScheduleStatus.RUNNING,
22
+ )
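The cron expression "0 0 * * 0" runs each job at midnight every Sunday. A sketch (not part of this commit; module layout and resource key are assumptions) of how the assets, jobs, schedules and the OpenAI resource might be collected into a Definitions object for the src.datastore code location served by the gRPC container:

# assumed wiring, not taken from the commit
from dagster import Definitions, EnvVar, load_assets_from_package_module
from dagster_openai import OpenAIResource

from src.datastore import assets
from src.datastore.jobs import adr_job, cdrc_job, ukds_job
from src.datastore.schedules import adr_schedule, cdrc_schedule, ukds_schedule

defs = Definitions(
    assets=load_assets_from_package_module(assets),
    jobs=[adr_job, cdrc_job, ukds_job],
    schedules=[adr_schedule, cdrc_schedule, ukds_schedule],
    resources={"openai": OpenAIResource(api_key=EnvVar("OPENAI_API_KEY"))},
)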
src/model/__init__.py ADDED
File without changes
src/model/answer.py ADDED
@@ -0,0 +1,32 @@
1
+ from dotenv import load_dotenv
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.pydantic_v1 import BaseModel, Field
4
+ from langchain_openai import ChatOpenAI
5
+
6
+ from src.common.settings import cfg
7
8
+
9
+ _ = load_dotenv()
10
+
11
+
12
+ class GradeAnswer(BaseModel):
13
+ """Binary score to assess answer addresses question."""
14
+
15
+ binary_score: str = Field(
16
+ description="Answer addresses the question, 'yes' or 'no'"
17
+ )
18
+
19
+
20
+ llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
21
+ structured_llm_grader = llm.with_structured_output(GradeAnswer)
22
+
23
+ # Prompt
24
+ system = """You are a grader assessing whether an answer addresses / resolves a question \n
25
+ Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
26
+ answer_prompt = ChatPromptTemplate.from_messages(
27
+ [
28
+ ("system", system),
29
+ ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
30
+ ]
31
+ )
32
+ answer_grader = answer_prompt | structured_llm_grader
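A usage sketch (not in the commit): the chain returns a GradeAnswer instance, so the decision is read off binary_score. The question and generation strings are illustrative; OPENAI_API_KEY must be available.

# not in the commit: invoking the answer grader
from src.model.answer import answer_grader

verdict = answer_grader.invoke(
    {
        "question": "Which datasets cover fuel poverty?",
        "generation": "The sub-regional fuel poverty dataset reports households in fuel poverty.",
    }
)
print(verdict.binary_score)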
src/model/dag.py ADDED
@@ -0,0 +1,79 @@
1
+ from dagster import AssetExecutionContext, asset
+ from dagster_openai import OpenAIResource
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_pinecone import PineconeVectorStore
+
+ from src.common.settings import cfg
+
+
+ @asset(compute_kind="OpenAI", deps=["pinecone_index"])
2
+ def search(context: AssetExecutionContext, openai: OpenAIResource):
3
+ with openai.get_client(context) as client:
4
+ embeddings = OpenAIEmbeddings(
5
+ client=client.embeddings, model=cfg.datastore.embed_model
6
+ )
7
+ vectorstore = PineconeVectorStore(
8
+ index_name=cfg.datastore.index_name, embedding=embeddings
9
+ )
10
+
11
+ # retriever = vectorstore.as_retriever()
12
+
13
+
14
+ from typing import Any, Dict
15
+
16
+ import requests
17
+ from fastapi import FastAPI, HTTPException
18
+ from pydantic import BaseModel
19
+
20
+ app = FastAPI()
21
+
22
+
23
+ class RunConfig(BaseModel):
24
+ run_config: Dict[str, Any]
25
+
26
+
27
+ def launch_dagster_run(pipeline_name: str, run_config: dict, mode: str = "default"):
28
+ url = "http://<your-dagster-instance>/graphql"
29
+ headers = {
30
+ "Content-Type": "application/json",
31
+ }
32
+ query = """
33
+ mutation($pipelineName: String!, $runConfigData: RunConfigData, $mode: String!) {
34
+ launchPipelineExecution(
35
+ executionParams: {
36
+ selector: {
37
+ pipelineName: $pipelineName
38
+ },
39
+ runConfigData: $runConfigData,
40
+ mode: $mode
41
+ }
42
+ ) {
43
+ __typename
44
+ ... on LaunchPipelineRunSuccess {
45
+ run {
46
+ runId
47
+ }
48
+ }
49
+ ... on PythonError {
50
+ message
51
+ stack
52
+ }
53
+ }
54
+ }
55
+ """
56
+ variables = {
57
+ "pipelineName": pipeline_name,
58
+ "runConfigData": run_config,
59
+ "mode": mode,
60
+ }
61
+ response = requests.post(
62
+ url, json={"query": query, "variables": variables}, headers=headers
63
+ )
64
+ return response.json()
65
+
66
+
67
+ @app.post("/trigger-asset")
68
+ def trigger_asset(pipeline_name: str, run_config: RunConfig, mode: str = "default"):
69
+ try:
70
+ result = launch_dagster_run(pipeline_name, run_config.run_config, mode)
71
+ return result
72
+ except Exception as e:
73
+ raise HTTPException(status_code=500, detail=str(e))
74
+
75
+
76
+ if __name__ == "__main__":
77
+ import uvicorn
78
+
79
+ uvicorn.run(app, host="0.0.0.0", port=8000)
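A client-side sketch (not in the commit) of hitting the /trigger-asset endpoint defined above. The job name and empty run config are illustrative, and the GraphQL URL placeholder inside launch_dagster_run still needs to be filled in before a launch can succeed.

# not in the commit: triggering a Dagster run through the FastAPI wrapper
import requests

resp = requests.post(
    "http://localhost:8000/trigger-asset",
    params={"pipeline_name": "ukds"},
    json={"run_config": {}},
)
print(resp.json())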
src/model/grader.py ADDED
@@ -0,0 +1,34 @@
1
+ from dotenv import load_dotenv
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.pydantic_v1 import BaseModel, Field
4
+ from langchain_openai import ChatOpenAI
5
+
6
+ from src.common.settings import cfg
7
+
8
+ _ = load_dotenv()
9
+
10
+
11
+ class GradeDocuments(BaseModel):
12
+ """Binary score for relevance check on retrieved documents."""
13
+
14
+ binary_score: str = Field(
15
+ description="Documents are relevant to the query, 'yes' or 'no'"
16
+ )
17
+
18
+
19
+ llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
20
+ structured_llm_grader = llm.with_structured_output(GradeDocuments)
21
+ system = """
22
+ You are a grader assessing relevance of a retrieved document to a user query. \n
23
+ It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
24
+ If the document contains keyword(s) or semantic meaning related to the user query, grade it as relevant. \n
25
+ Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the query.
26
+ """
27
+ grade_prompt = ChatPromptTemplate.from_messages(
28
+ [
29
+ ("system", system),
30
+ ("human", "Retrieved document: \n\n {document} \n\n User query: {query}"),
31
+ ]
32
+ )
33
+
34
+ retrieval_grader = grade_prompt | structured_llm_grader
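A usage sketch (not in the commit): the grader chain returns a GradeDocuments instance, so the yes/no decision is read off binary_score. The query and document are toy strings; OPENAI_API_KEY must be set.

# not in the commit: invoking the retrieval grader
from src.model.grader import retrieval_grader

result = retrieval_grader.invoke(
    {
        "query": "deprivation indices for English neighbourhoods",
        "document": "The Index of Multiple Deprivation ranks small areas in England.",
    }
)
print(result.binary_score)  # expected: "yes"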
src/model/hallucination.py ADDED
@@ -0,0 +1,32 @@
1
+ from dotenv import load_dotenv
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.pydantic_v1 import BaseModel, Field
4
+ from langchain_openai import ChatOpenAI
5
+
6
+ from src.common.settings import cfg
7
8
+
9
+ _ = load_dotenv()
10
+
11
+
12
+ class GradeHallucinations(BaseModel):
13
+ """Binary score for hallucination present in generation answer."""
14
+
15
+ binary_score: str = Field(
16
+ description="Answer is grounded in the facts, 'yes' or 'no'"
17
+ )
18
+
19
+
20
+ llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
21
+ structured_llm_grader = llm.with_structured_output(GradeHallucinations)
22
+
23
+ system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
24
+ Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
25
+ hallucination_prompt = ChatPromptTemplate.from_messages(
26
+ [
27
+ ("system", system),
28
+ ("human", "Set of facts: \n\n {document} \n\n LLM generation: {generation}"),
29
+ ]
30
+ )
31
+
32
+ hallucination_grader = hallucination_prompt | structured_llm_grader
src/model/model.py ADDED
@@ -0,0 +1,132 @@
1
+ from typing import TypedDict
2
+
3
+ from dotenv import load_dotenv
4
+ from langchain_core.documents import Document
5
+ from langchain_openai import OpenAIEmbeddings
6
+ from langchain_pinecone import PineconeVectorStore
7
+ from langgraph.graph import END, START, StateGraph
8
+
9
+ from src.common.settings import cfg
10
+ from src.model.grader import retrieval_grader
11
+ from src.model.hallucination import hallucination_grader
12
+ from src.model.rag import rag_chain
13
+
14
+ _ = load_dotenv()
15
+
16
+
17
+ class SearchState(TypedDict):
18
+ query: str
19
+ documents: list[str]
20
+
21
+
22
+ class GenerationState(TypedDict):
23
+ query: str
24
+ document: str
25
+ generation: str
26
+
27
+
28
+ def _group_by_document(documents):
29
+ grouped_id: dict[str, list[tuple[Document, float]]] = {}
30
+
31
+ for node, score in documents:
32
+ id = node.metadata["id"]
33
+ if id not in grouped_id:
34
+ grouped_id[id] = []
35
+ grouped_id[id].append((node, score))
36
+
37
+ out_nodes = []
38
+ for group in grouped_id.values():
39
+ nodes = [n[0] for n in group]
40
+ scores = [n[1] for n in group]
41
+ content = "\n--------------------\n".join([n.page_content for n in nodes])
42
+ document = Document(
43
+ page_content=content, metadata=nodes[0].metadata | {"score": max(scores)}
44
+ )
45
+ out_nodes.append(document)
46
+ return out_nodes
47
+
48
+
49
+ def retrieve(state):
50
+ print("---RETRIEVE---")
51
+ embeddings = OpenAIEmbeddings(model=cfg.datastore.embed_model)
52
+ vectorstore = PineconeVectorStore(
53
+ index_name=cfg.datastore.index_name,
54
+ embedding=embeddings,
55
+ )
56
+ query = state["query"]
57
+ documents = vectorstore.similarity_search_with_score(query=query, k=cfg.model.top_k)
58
+ documents = _group_by_document(documents)
59
+ return {"documents": documents, "query": query}
60
+
61
+
62
+ def grade_documents(state):
63
+ print("---CHECK DOCUMENT RELEVANCE TO QUERY---")
64
+ query = state["query"]
65
+ documents = state["documents"]
66
+
67
+ filtered_docs = []
68
+ for d in documents:
69
+ score = retrieval_grader.invoke({"query": query, "document": d.page_content})
70
+ grade = score.binary_score # type: ignore
71
+ if grade == "yes":
72
+ print("---GRADE: DOCUMENT RELEVANT---")
73
+ filtered_docs.append(d)
74
+ else:
75
+ print("---GRADE: DOCUMENT NOT RELEVANT---")
76
+ continue
77
+ return {"documents": filtered_docs, "query": query}
78
+
79
+
80
+ def generation(state):
81
+ query = state["query"]
82
+ document = state["document"]
83
+
84
+ generation = rag_chain.invoke({"query": query, "context": document})
85
+ return {"query": query, "document": document, "generation": generation}
86
+
87
+
88
+ def grade_generation(state):
89
+ query = state["query"]
90
+ document = state["document"]
91
+ generation = state["generation"]
92
+
93
+ score = hallucination_grader.invoke(
94
+ {"document": document, "generation": generation}
95
+ )
96
+ grade = score.binary_score
97
+
98
+ if grade == "yes":
99
+ return {"query": query, "document": document, "generation": generation}
100
+ else:
101
+ return {"query": query, "document": document, "generation": "Hallucination"}
102
+
103
+
104
+ def search_graph():
105
+ workflow = StateGraph(SearchState)
106
+ workflow.add_node("retrieve", retrieve)
107
+ # workflow.add_node("grade_documents", grade_documents)
108
+
109
+ workflow.add_edge(START, "retrieve")
110
+ workflow.add_edge("retrieve", END)
111
+ # workflow.add_edge("retrieve", "grade_documents")
112
+ # workflow.add_edge("grade_documents", END)
113
+ return workflow.compile()
114
+
115
+
116
+ def generation_graph():
117
+ workflow = StateGraph(GenerationState)
118
+ workflow.add_node("gen", generation)
119
+ workflow.add_node("grade_generation", grade_generation)
120
+
121
+ workflow.add_edge(START, "gen")
122
+ workflow.add_edge("gen", "grade_generation")
123
+ workflow.add_edge("grade_generation", END)
124
+ return workflow.compile()
125
+
126
+
127
+ if __name__ == "__main__":
+     # ad-hoc smoke test; guarded so importing this module (e.g. from the search API)
+     # does not fire a live Pinecone/OpenAI query at import time
+     search_app = search_graph()
+     thread_id = 42
+     q = "What is the capital of France?"
+
+     out = search_app.invoke(
+         {"query": q}, config={"configurable": {"thread_id": thread_id}}
+     )
+     print([d.dict() for d in out["documents"]])
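An end-to-end sketch (not in the commit) that mirrors what the API endpoints do: run the search graph, then ask the generation graph to explain the top-ranked document. The query string and thread id are illustrative; a populated Pinecone index and the usual API keys are assumed.

# not in the commit: chaining the compiled search and generation graphs
from src.model.model import generation_graph, search_graph

search_app = search_graph()
gen_app = generation_graph()
config = {"configurable": {"thread_id": 1}}

state = search_app.invoke({"query": "air quality and health outcomes"}, config=config)
top_doc = state["documents"][0]

explained = gen_app.invoke(
    {"query": "air quality and health outcomes", "document": top_doc.page_content},
    config=config,
)
print(explained["generation"])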
src/model/rag.py ADDED
@@ -0,0 +1,23 @@
1
+ from dotenv import load_dotenv
2
+ from langchain_core.output_parsers import StrOutputParser
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_openai import ChatOpenAI
5
+
6
+ from src.common.settings import cfg
7
+
8
+ _ = load_dotenv()
9
+ human = """
10
+ A user has queried a data catalogue, which has returned a relevant dataset.
11
+
12
+ Explain the relevance of this dataset to the query in under three sentences. Use your own knowledge or the data profile. Do not say it is unrelated; attempt to find a relevant connection.
13
+
14
+ Query: "{query}"
15
+
16
+ Dataset description:
17
+
18
+ {context}
19
+ """
20
+
21
+ gen_prompt = ChatPromptTemplate.from_messages([("human", human)])
22
+ llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
23
+ rag_chain = gen_prompt | llm | StrOutputParser()
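A usage sketch (not in the commit): the chain takes the user query plus a dataset description and returns a short relevance explanation as a plain string. The inputs are illustrative.

# not in the commit: invoking the RAG explanation chain
from src.model.rag import rag_chain

explanation = rag_chain.invoke(
    {
        "query": "school attainment by local authority",
        "context": "Dataset Title: Key Stage 4 performance\n\nAnnual GCSE results by school and local authority.",
    }
)
print(explanation)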
src/search_api/__init__.py ADDED
File without changes
src/search_api/api.py ADDED
@@ -0,0 +1,55 @@
1
+ from uuid import UUID, uuid4
2
+
3
+ from fastapi import FastAPI
4
+ from langchain_core.documents import Document
5
+
6
+ from src.model.model import generation_graph, search_graph
7
+
8
+ app = FastAPI()
9
+ document_store = {}
10
+ query_mapping = {}
11
+
12
+
13
+ search_app = search_graph()
14
+ gen_app = generation_graph()
15
+
16
+
17
+ @app.get("/")
18
+ def index():
19
+ return {"message": "Make a post request to /query."}
20
+
21
+
22
+ @app.post("/query")
23
+ async def query(q: str) -> dict:
24
+ thread_id = uuid4()
25
+ out = search_app.invoke(
26
+ {"query": q}, config={"configurable": {"thread_id": thread_id}}
27
+ )
28
+ docs_dict = [d.dict() for d in out["documents"]]
29
+ document_store[thread_id] = docs_dict
30
+ query_mapping[thread_id] = q
31
+ return {"thread_id": thread_id, "query": q, "documents": docs_dict}
32
+
33
+
34
+ @app.get("/explain/{thread_id}")
35
+ async def explain(thread_id: UUID, docid: int) -> dict:
36
+ doc_dict = document_store[thread_id][docid]
37
+ document = Document(
38
+ page_content=doc_dict["page_content"],
39
+ metadata=doc_dict["metadata"],
40
+ )
41
+ query = query_mapping[thread_id]
42
+ generation_state = gen_app.invoke(
43
+ {"query": query, "document": document},
44
+ config={"configurable": {"thread_id": thread_id}},
45
+ )
46
+ generation = generation_state["generation"]
47
+
48
+ return {
49
+ "generation": generation,
50
+ "metadata": {
51
+ "thread_id": thread_id,
52
+ "query": query,
53
+ "related_dataset": doc_dict,
54
+ },
55
+ }
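A client sketch (not in the commit): query the API, then request an explanation for the first returned document. It assumes the service is running on localhost:8000; the query string is illustrative.

# not in the commit: calling /query and /explain with requests
import requests

r = requests.post("http://localhost:8000/query", params={"q": "housing affordability"})
payload = r.json()
thread_id = payload["thread_id"]
print([d["metadata"]["title"] for d in payload["documents"]])

explain = requests.get(
    f"http://localhost:8000/explain/{thread_id}", params={"docid": 0}
)
print(explain.json()["generation"])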
src/search_api/streamlit_app.py ADDED
@@ -0,0 +1,66 @@
1
+ from subprocess import Popen
2
+ from time import sleep
3
+
4
+ import requests
5
+ import streamlit as st
6
+
7
+
8
+ def main():
9
+ st.title("CDRC Semantic Search App")
10
+
11
+ with st.spinner("Loading..."):
12
+ while True:
13
+ try:
14
+ r = requests.get("http://localhost:8000/")
15
+ if r.status_code == 200:
16
+ break
17
+ except requests.exceptions.ConnectionError:
18
+ Popen(["uvicorn", "src.search_api.api:app", "--port", "8000"])
19
+ sleep(10)
20
+
21
+ # use_llm = st.toggle("Activate LLM")
22
+ use_llm = False
23
+ text = st.text_input("Query")
24
+ if text == "":
25
+ return None
26
+
27
+ r = requests.post(
28
+ "http://localhost:8000/query", params={"q": text, "use_llm": use_llm}
29
+ )
30
+ if r.status_code != 200:
31
+ st.error("No results :(")
32
+ return None
33
+
34
+ if use_llm:
35
+ response, metadata = r.json()
36
+ responses = []
37
+ for r in response["response"].split("---------------------"):
38
+ if all(x in r for x in ["Summary: ", "Relevance: "]):
39
+ responses.append(r)
40
+ else:
41
+ responses.append(None)
42
+
43
+ for res, meta in zip(responses, metadata.values(), strict=False):
44
+ st.subheader(meta["title"])
45
+ if res:
46
+ summary, relevance = res.split("Summary: ")[1].split("Relevance: ")
47
+ st.caption(summary)
48
+ st.caption(f":red[{relevance}]")
49
+ # st.caption(f"Score: :red[{meta['score']:.3f}]")
50
+ else:
51
+ st.caption("LLM did not return a response.")
52
+ if meta["url"] != "None":
53
+ st.write(meta["url"])
54
+ st.divider()
55
+ else:
56
+ documents = r.json()["documents"]
+ for doc in documents:
+     meta = doc["metadata"]
+     st.subheader(meta["title"])
+     st.caption(f"Score: :red[{meta['score']:.3f}]")
60
+ if meta["url"] != "None":
61
+ st.write(meta["url"])
62
+ st.divider()
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
src/sensors.py DELETED
@@ -1,2 +0,0 @@
1
- # see https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors
2
- # cursor to check which files are new