Spaces:
Sleeping
Sleeping
refactor to use langgraph + dagster
Browse files- Containerfile +12 -0
- compose.yml +99 -0
- config/config.toml +10 -0
- dagster.yaml +52 -0
- pyproject.toml +18 -10
- reports/DOCS.md +72 -0
- reports/figs/system.png +0 -0
- requirements-dev.lock +119 -42
- requirements.lock +116 -39
- src/__init__.py +0 -14
- src/assets/datastore.py +0 -48
- src/common/logging.py +3 -0
- src/common/settings.py +28 -0
- src/common/utils.py +14 -5
- src/datastore/__init__.py +18 -0
- src/{assets → datastore/assets}/adr.py +43 -17
- src/datastore/assets/cdrc.py +113 -0
- src/datastore/assets/datastore.py +99 -0
- src/datastore/assets/ukds.py +122 -0
- src/{jobs.py → datastore/jobs.py} +14 -1
- src/datastore/loaders.py +154 -0
- src/{resources.py → datastore/resources.py} +0 -0
- src/datastore/schedules.py +22 -0
- src/model/__init__.py +0 -0
- src/model/answer.py +32 -0
- src/model/dag.py +79 -0
- src/model/grader.py +34 -0
- src/model/hallucination.py +32 -0
- src/model/model.py +132 -0
- src/model/rag.py +23 -0
- src/search_api/__init__.py +0 -0
- src/search_api/api.py +55 -0
- src/search_api/streamlit_app.py +66 -0
- src/sensors.py +0 -2
Containerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12
|
| 2 |
+
|
| 3 |
+
ENV VIRTUAL_ENV=/usr/local
|
| 4 |
+
|
| 5 |
+
ENV DAGSTER_HOME=/opt/dagster/dagster_home/
|
| 6 |
+
RUN mkdir -p $DAGSTER_HOME /opt/dagster/app
|
| 7 |
+
COPY dagster.yaml $DAGSTER_HOME
|
| 8 |
+
|
| 9 |
+
WORKDIR /opt/dagster/app
|
| 10 |
+
|
| 11 |
+
COPY requirements.lock pyproject.toml .env README.md ./
|
| 12 |
+
RUN pip install --no-cache-dir -r requirements.lock
|
compose.yml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
datastore-postgresql:
|
| 3 |
+
image: postgres:11
|
| 4 |
+
container_name: datastore-postgresql
|
| 5 |
+
environment:
|
| 6 |
+
POSTGRES_USER: "postgres_user"
|
| 7 |
+
POSTGRES_PASSWORD: "postgres_password"
|
| 8 |
+
POSTGRES_DB: "postgres_db"
|
| 9 |
+
networks:
|
| 10 |
+
- datastore-network
|
| 11 |
+
|
| 12 |
+
datastore-code:
|
| 13 |
+
container_name: datastore-code
|
| 14 |
+
restart: always
|
| 15 |
+
build:
|
| 16 |
+
context: .
|
| 17 |
+
dockerfile: Containerfile
|
| 18 |
+
image: datastore
|
| 19 |
+
security_opt:
|
| 20 |
+
- "label:disable"
|
| 21 |
+
entrypoint:
|
| 22 |
+
- dagster
|
| 23 |
+
- api
|
| 24 |
+
- grpc
|
| 25 |
+
- -h
|
| 26 |
+
- "0.0.0.0"
|
| 27 |
+
- -p
|
| 28 |
+
- "4000"
|
| 29 |
+
- -m
|
| 30 |
+
- src.datastore
|
| 31 |
+
environment:
|
| 32 |
+
DAGSTER_POSTGRES_USER: "postgres_user"
|
| 33 |
+
DAGSTER_POSTGRES_PASSWORD: "postgres_password"
|
| 34 |
+
DAGSTER_POSTGRES_DB: "postgres_db"
|
| 35 |
+
DAGSTER_CURRENT_IMAGE: "datastore"
|
| 36 |
+
volumes:
|
| 37 |
+
- ./src:/opt/dagster/app/src
|
| 38 |
+
- ./config:/opt/dagster/app/config
|
| 39 |
+
- ./data:/opt/dagster/app/data
|
| 40 |
+
networks:
|
| 41 |
+
- datastore-network
|
| 42 |
+
|
| 43 |
+
datastore-server:
|
| 44 |
+
container_name: datastore-server
|
| 45 |
+
build:
|
| 46 |
+
context: .
|
| 47 |
+
dockerfile: Containerfile
|
| 48 |
+
entrypoint:
|
| 49 |
+
- dagster-webserver
|
| 50 |
+
- -h
|
| 51 |
+
- "0.0.0.0"
|
| 52 |
+
- -p
|
| 53 |
+
- "3000"
|
| 54 |
+
expose:
|
| 55 |
+
- "3000"
|
| 56 |
+
ports:
|
| 57 |
+
- 3000:3000
|
| 58 |
+
security_opt:
|
| 59 |
+
- "label:disable"
|
| 60 |
+
environment:
|
| 61 |
+
DAGSTER_POSTGRES_USER: "postgres_user"
|
| 62 |
+
DAGSTER_POSTGRES_PASSWORD: "postgres_password"
|
| 63 |
+
DAGSTER_POSTGRES_DB: "postgres_db"
|
| 64 |
+
volumes:
|
| 65 |
+
- ./src:/opt/dagster/app/src
|
| 66 |
+
- ./config:/opt/dagster/app/config
|
| 67 |
+
- ./data:/opt/dagster/app/data
|
| 68 |
+
networks:
|
| 69 |
+
- datastore-network
|
| 70 |
+
|
| 71 |
+
datastore-daemon:
|
| 72 |
+
container_name: datastore-daemon
|
| 73 |
+
restart: on-failure
|
| 74 |
+
build:
|
| 75 |
+
context: .
|
| 76 |
+
dockerfile: Containerfile
|
| 77 |
+
entrypoint:
|
| 78 |
+
- dagster-daemon
|
| 79 |
+
- run
|
| 80 |
+
security_opt:
|
| 81 |
+
- "label:disable"
|
| 82 |
+
environment:
|
| 83 |
+
DAGSTER_POSTGRES_USER: "postgres_user"
|
| 84 |
+
DAGSTER_POSTGRES_PASSWORD: "postgres_password"
|
| 85 |
+
DAGSTER_POSTGRES_DB: "postgres_db"
|
| 86 |
+
volumes:
|
| 87 |
+
- ./src:/opt/dagster/app/src
|
| 88 |
+
- ./config:/opt/dagster/app/config
|
| 89 |
+
- ./data:/opt/dagster/app/data
|
| 90 |
+
networks:
|
| 91 |
+
- datastore-network
|
| 92 |
+
depends_on:
|
| 93 |
+
- datastore-postgresql
|
| 94 |
+
- datastore-code
|
| 95 |
+
|
| 96 |
+
networks:
|
| 97 |
+
datastore-network:
|
| 98 |
+
driver: bridge
|
| 99 |
+
name: datastore-network
|
config/config.toml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[datastore]
|
| 2 |
+
index_name = "data-catalogue"
|
| 3 |
+
embed_model = "text-embedding-3-large"
|
| 4 |
+
embed_dim = 3072
|
| 5 |
+
chunk_size = 256
|
| 6 |
+
chunk_overlap = 32
|
| 7 |
+
|
| 8 |
+
[model]
|
| 9 |
+
llm = "gpt-3.5-turbo"
|
| 10 |
+
top_k = 30
|
dagster.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scheduler:
|
| 2 |
+
module: dagster.core.scheduler
|
| 3 |
+
class: DagsterDaemonScheduler
|
| 4 |
+
|
| 5 |
+
run_retries:
|
| 6 |
+
enabled: true
|
| 7 |
+
max_retries: 3
|
| 8 |
+
|
| 9 |
+
run_storage:
|
| 10 |
+
module: dagster_postgres.run_storage
|
| 11 |
+
class: PostgresRunStorage
|
| 12 |
+
config:
|
| 13 |
+
postgres_db:
|
| 14 |
+
hostname: datastore-postgresql
|
| 15 |
+
username:
|
| 16 |
+
env: DAGSTER_POSTGRES_USER
|
| 17 |
+
password:
|
| 18 |
+
env: DAGSTER_POSTGRES_PASSWORD
|
| 19 |
+
db_name:
|
| 20 |
+
env: DAGSTER_POSTGRES_DB
|
| 21 |
+
port: 5432
|
| 22 |
+
|
| 23 |
+
schedule_storage:
|
| 24 |
+
module: dagster_postgres.schedule_storage
|
| 25 |
+
class: PostgresScheduleStorage
|
| 26 |
+
config:
|
| 27 |
+
postgres_db:
|
| 28 |
+
hostname: datastore-postgresql
|
| 29 |
+
username:
|
| 30 |
+
env: DAGSTER_POSTGRES_USER
|
| 31 |
+
password:
|
| 32 |
+
env: DAGSTER_POSTGRES_PASSWORD
|
| 33 |
+
db_name:
|
| 34 |
+
env: DAGSTER_POSTGRES_DB
|
| 35 |
+
port: 5432
|
| 36 |
+
|
| 37 |
+
event_log_storage:
|
| 38 |
+
module: dagster_postgres.event_log
|
| 39 |
+
class: PostgresEventLogStorage
|
| 40 |
+
config:
|
| 41 |
+
postgres_db:
|
| 42 |
+
hostname: datastore-postgresql
|
| 43 |
+
username:
|
| 44 |
+
env: DAGSTER_POSTGRES_USER
|
| 45 |
+
password:
|
| 46 |
+
env: DAGSTER_POSTGRES_PASSWORD
|
| 47 |
+
db_name:
|
| 48 |
+
env: DAGSTER_POSTGRES_DB
|
| 49 |
+
port: 5432
|
| 50 |
+
|
| 51 |
+
telemetry:
|
| 52 |
+
enabled: false
|
pyproject.toml
CHANGED
|
@@ -6,18 +6,26 @@ authors = [
|
|
| 6 |
{ name = "cjber", email = "cjberragan@gmail.com" }
|
| 7 |
]
|
| 8 |
dependencies = [
|
| 9 |
-
"requests>=2.31.0",
|
| 10 |
-
"langchain>=0.2.0",
|
| 11 |
-
"langchain-community>=0.2.0",
|
| 12 |
-
"langchain-openai>=0.1.7",
|
| 13 |
-
"langchain-pinecone>=0.1.1",
|
| 14 |
-
"polars>=0.20.25",
|
| 15 |
-
# "duckdb>=0.10.2",
|
| 16 |
-
# "duckdb-engine>=0.12.0",
|
| 17 |
-
"python-dotenv>=1.0.1",
|
| 18 |
"dagster>=1.7.8",
|
| 19 |
"dagster-webserver>=1.7.8",
|
|
|
|
| 20 |
"dagster-openai>=0.23.9",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
]
|
| 22 |
readme = "README.md"
|
| 23 |
requires-python = ">= 3.8"
|
|
@@ -41,4 +49,4 @@ allow-direct-references = true
|
|
| 41 |
packages = ["src/semantic_catalogue"]
|
| 42 |
|
| 43 |
[tool.dagster]
|
| 44 |
-
module_name = "src"
|
|
|
|
| 6 |
{ name = "cjber", email = "cjberragan@gmail.com" }
|
| 7 |
]
|
| 8 |
dependencies = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"dagster>=1.7.8",
|
| 10 |
"dagster-webserver>=1.7.8",
|
| 11 |
+
"dagster-postgres>=0.23.15",
|
| 12 |
"dagster-openai>=0.23.9",
|
| 13 |
+
"requests>=2.31.0",
|
| 14 |
+
"polars>=0.20.25",
|
| 15 |
+
"python-dotenv>=1.0.1",
|
| 16 |
+
"sickle>=0.7.0",
|
| 17 |
+
"lxml>=5.2.2",
|
| 18 |
+
"pydantic-settings>=2.3.4",
|
| 19 |
+
"pdfminer-six>=20240706",
|
| 20 |
+
"dateparser>=1.2.0",
|
| 21 |
+
"langchain>=0.2.12",
|
| 22 |
+
"langchain-community>=0.2.10",
|
| 23 |
+
"langchain-pinecone>=0.1.3",
|
| 24 |
+
"langchain-openai>=0.1.20",
|
| 25 |
+
"langchain-experimental>=0.0.63",
|
| 26 |
+
"langgraph>=0.1.19",
|
| 27 |
+
"langchainhub>=0.1.20",
|
| 28 |
+
"fastapi[standard]>=0.112.0",
|
| 29 |
]
|
| 30 |
readme = "README.md"
|
| 31 |
requires-python = ">= 3.8"
|
|
|
|
| 49 |
packages = ["src/semantic_catalogue"]
|
| 50 |
|
| 51 |
[tool.dagster]
|
| 52 |
+
module_name = "src.datastore"
|
reports/DOCS.md
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Introduction
|
| 2 |
+
|
| 3 |
+
* Unify search across catalogues
|
| 4 |
+
* Uses semantic search with RAG for results explainability
|
| 5 |
+
* Llama Index framework
|
| 6 |
+
|
| 7 |
+
# Methodology
|
| 8 |
+
|
| 9 |
+
## Pre-processing
|
| 10 |
+
|
| 11 |
+
For each catalogue their respective API was used to return dataset metadata. Each returned result contained descriptive information regarding datasets, which form the bulk of text data used by the semantic search system to return results. For the CDRC catalogue, PDFs were also processed to extract text. Other metadata was also returned which may be used by the final system; for example, data creation date.
|
| 12 |
+
|
| 13 |
+
## Datastore
|
| 14 |
+
|
| 15 |
+
The description of each dataset were then saved into individual text files, identifiable by a unique ID. These files were then embedded using OpenAI embeddings, and uploaded to the Pinecone database, alongside any metadata. Descriptions were 'chunked' into individual segments 1024 tokens in length. For each chunk, the dataset title as embedded at the start.
|
| 16 |
+
|
| 17 |
+
## RAG Model
|
| 18 |
+
|
| 19 |
+
A RAG system was then built which embeds a user query using the same embedding model, and returns the top 'k' results ranked by cosine similarity from the Pinecone database. To ensure that results are ranked by dataset, a custom document grouping postprocessor was defined, which grouped all document chunks relating to the same dataset. The highest score from any chunk is used to rank grouped documents.
|
| 20 |
+
|
| 21 |
+
An adjustable 'alpha' value was used to allow for a mixture of traditional 'sparse vector' search (e.g. BM25: keyword search), and the 'dense vector' search, using the LLM embeddings.
|
| 22 |
+
|
| 23 |
+
For each unique document returned, an explainable 'Ask AI' option was added, which feeds the grouped document into a GPT LLM with the following prompt:
|
| 24 |
+
|
| 25 |
+
```python
|
| 26 |
+
prompt = """
|
| 27 |
+
Below is a dataset description that is relevant to a researchers query.
|
| 28 |
+
|
| 29 |
+
Explain the relevance of this dataset to the query in under 50 words. Use your own knowledge or the data profile. Do not say it is unrelated; attempt to find a relevant connection.
|
| 30 |
+
|
| 31 |
+
---------------------
|
| 32 |
+
Query: "{query_str}"
|
| 33 |
+
|
| 34 |
+
Dataset description:
|
| 35 |
+
|
| 36 |
+
{context_str}
|
| 37 |
+
---------------------
|
| 38 |
+
"""
|
| 39 |
+
```
|
| 40 |
+
This approach ensures that users receive not only relevant search results but also understandable explanations regarding the relevance of each dataset to their query.
|
| 41 |
+
|
| 42 |
+
# System architecture
|
| 43 |
+
|
| 44 |
+
## Overview
|
| 45 |
+
|
| 46 |
+

|
| 47 |
+
|
| 48 |
+
## Data flow
|
| 49 |
+
|
| 50 |
+
(Describe the flow of data from the catalogues to the end-user.)
|
| 51 |
+
|
| 52 |
+
## Implementation details
|
| 53 |
+
|
| 54 |
+
* Tools and Libraries: OpenAI API, Pinecone, Llama Index
|
| 55 |
+
* Challenges: (Detail any challenges and solutions.)
|
| 56 |
+
|
| 57 |
+
# Evaluation and results
|
| 58 |
+
|
| 59 |
+
* Performance Metrics: Search accuracy, response time, user feedback
|
| 60 |
+
* Comparison: Effectiveness of keyword search vs. dense vector search
|
| 61 |
+
|
| 62 |
+
# Future work and improvements
|
| 63 |
+
|
| 64 |
+
* Potential improvements and future enhancements
|
| 65 |
+
* Discuss limitations of the current implementation
|
| 66 |
+
|
| 67 |
+
# Conclusion
|
| 68 |
+
|
| 69 |
+
Summarise the key points and the impact of the unified search system.
|
| 70 |
+
References
|
| 71 |
+
|
| 72 |
+
(List any academic papers, tools, or libraries referenced.)
|
reports/figs/system.png
ADDED
|
requirements-dev.lock
CHANGED
|
@@ -7,14 +7,16 @@
|
|
| 7 |
# all-features: false
|
| 8 |
# with-sources: false
|
| 9 |
# generate-hashes: false
|
|
|
|
| 10 |
|
| 11 |
-e file:.
|
| 12 |
aiohttp==3.9.5
|
| 13 |
# via langchain
|
| 14 |
# via langchain-community
|
|
|
|
| 15 |
aiosignal==1.3.1
|
| 16 |
# via aiohttp
|
| 17 |
-
alembic==1.13.
|
| 18 |
# via dagster
|
| 19 |
aniso8601==9.0.1
|
| 20 |
# via graphene
|
|
@@ -32,51 +34,70 @@ attrs==23.2.0
|
|
| 32 |
# via aiohttp
|
| 33 |
backoff==2.2.1
|
| 34 |
# via gql
|
| 35 |
-
certifi==2024.
|
| 36 |
# via httpcore
|
| 37 |
# via httpx
|
| 38 |
# via pinecone-client
|
| 39 |
# via requests
|
|
|
|
|
|
|
| 40 |
charset-normalizer==3.3.2
|
|
|
|
| 41 |
# via requests
|
| 42 |
click==8.1.7
|
| 43 |
# via dagster
|
| 44 |
# via dagster-webserver
|
|
|
|
| 45 |
# via uvicorn
|
| 46 |
coloredlogs==14.0
|
| 47 |
# via dagster
|
| 48 |
-
croniter==
|
| 49 |
# via dagster
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
# via dagster-graphql
|
| 52 |
# via dagster-openai
|
|
|
|
| 53 |
# via dagster-webserver
|
| 54 |
# via semantic-catalogue
|
| 55 |
-
dagster-graphql==1.7.
|
| 56 |
# via dagster-webserver
|
| 57 |
-
dagster-openai==0.23.
|
| 58 |
# via semantic-catalogue
|
| 59 |
-
dagster-pipes==1.7.
|
| 60 |
# via dagster
|
| 61 |
-
dagster-
|
|
|
|
|
|
|
| 62 |
# via semantic-catalogue
|
| 63 |
-
dataclasses-json==0.6.
|
| 64 |
# via langchain-community
|
|
|
|
|
|
|
| 65 |
decorator==5.1.1
|
| 66 |
# via ipdb
|
| 67 |
# via ipython
|
| 68 |
distro==1.9.0
|
| 69 |
# via openai
|
|
|
|
|
|
|
| 70 |
docstring-parser==0.16
|
| 71 |
# via dagster
|
|
|
|
|
|
|
| 72 |
executing==2.0.1
|
| 73 |
# via stack-data
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# via dagster
|
| 76 |
frozenlist==1.4.1
|
| 77 |
# via aiohttp
|
| 78 |
# via aiosignal
|
| 79 |
-
fsspec==2024.
|
| 80 |
# via universal-pathlib
|
| 81 |
gql==3.5.0
|
| 82 |
# via dagster-graphql
|
|
@@ -90,7 +111,7 @@ graphql-relay==3.2.0
|
|
| 90 |
# via graphene
|
| 91 |
greenlet==3.0.3
|
| 92 |
# via sqlalchemy
|
| 93 |
-
grpcio==1.64.
|
| 94 |
# via dagster
|
| 95 |
# via grpcio-health-checking
|
| 96 |
grpcio-health-checking==1.62.2
|
|
@@ -103,46 +124,61 @@ httpcore==1.0.5
|
|
| 103 |
httptools==0.6.1
|
| 104 |
# via uvicorn
|
| 105 |
httpx==0.27.0
|
|
|
|
| 106 |
# via openai
|
| 107 |
humanfriendly==10.0
|
| 108 |
# via coloredlogs
|
| 109 |
idna==3.7
|
| 110 |
# via anyio
|
|
|
|
| 111 |
# via httpx
|
| 112 |
# via requests
|
| 113 |
# via yarl
|
| 114 |
ipdb==0.13.13
|
| 115 |
-
ipython==8.
|
| 116 |
# via ipdb
|
| 117 |
jedi==0.19.1
|
| 118 |
# via ipython
|
| 119 |
jinja2==3.1.4
|
| 120 |
# via dagster
|
|
|
|
| 121 |
jsonpatch==1.33
|
| 122 |
# via langchain-core
|
| 123 |
-
jsonpointer==
|
| 124 |
# via jsonpatch
|
| 125 |
-
langchain==0.2.
|
| 126 |
# via langchain-community
|
| 127 |
# via semantic-catalogue
|
| 128 |
-
langchain-community==0.2.
|
|
|
|
| 129 |
# via semantic-catalogue
|
| 130 |
-
langchain-core==0.2.
|
| 131 |
# via langchain
|
| 132 |
# via langchain-community
|
|
|
|
| 133 |
# via langchain-openai
|
| 134 |
# via langchain-pinecone
|
| 135 |
# via langchain-text-splitters
|
| 136 |
-
|
|
|
|
| 137 |
# via semantic-catalogue
|
| 138 |
-
langchain-
|
| 139 |
# via semantic-catalogue
|
| 140 |
-
langchain-
|
|
|
|
|
|
|
| 141 |
# via langchain
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
# via langchain
|
| 144 |
# via langchain-community
|
| 145 |
# via langchain-core
|
|
|
|
|
|
|
|
|
|
| 146 |
mako==1.3.5
|
| 147 |
# via alembic
|
| 148 |
markdown-it-py==3.0.0
|
|
@@ -150,7 +186,7 @@ markdown-it-py==3.0.0
|
|
| 150 |
markupsafe==2.1.5
|
| 151 |
# via jinja2
|
| 152 |
# via mako
|
| 153 |
-
marshmallow==3.21.
|
| 154 |
# via dataclasses-json
|
| 155 |
matplotlib-inline==0.1.7
|
| 156 |
# via ipython
|
|
@@ -165,64 +201,85 @@ numpy==1.26.4
|
|
| 165 |
# via langchain
|
| 166 |
# via langchain-community
|
| 167 |
# via langchain-pinecone
|
| 168 |
-
openai==1.
|
| 169 |
# via dagster-openai
|
| 170 |
# via langchain-openai
|
| 171 |
-
orjson==3.10.
|
| 172 |
# via langsmith
|
| 173 |
-
packaging==
|
| 174 |
# via dagster
|
| 175 |
# via langchain-core
|
|
|
|
| 176 |
# via marshmallow
|
| 177 |
parso==0.8.4
|
| 178 |
# via jedi
|
|
|
|
|
|
|
| 179 |
pendulum==3.0.0
|
| 180 |
# via dagster
|
| 181 |
pexpect==4.9.0
|
| 182 |
# via ipython
|
| 183 |
-
pinecone-client==
|
| 184 |
# via langchain-pinecone
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# via semantic-catalogue
|
| 187 |
-
prompt-toolkit==3.0.
|
| 188 |
# via ipython
|
| 189 |
-
protobuf==4.25.
|
| 190 |
# via dagster
|
| 191 |
# via grpcio-health-checking
|
|
|
|
|
|
|
| 192 |
ptyprocess==0.7.0
|
| 193 |
# via pexpect
|
| 194 |
-
pure-eval==0.2.
|
| 195 |
# via stack-data
|
| 196 |
-
|
|
|
|
|
|
|
| 197 |
# via dagster
|
|
|
|
| 198 |
# via langchain
|
| 199 |
# via langchain-core
|
| 200 |
# via langsmith
|
| 201 |
# via openai
|
| 202 |
-
pydantic-
|
|
|
|
| 203 |
# via pydantic
|
|
|
|
|
|
|
| 204 |
pygments==2.18.0
|
| 205 |
# via ipython
|
| 206 |
# via rich
|
| 207 |
python-dateutil==2.9.0.post0
|
| 208 |
# via croniter
|
| 209 |
-
# via
|
| 210 |
# via pendulum
|
| 211 |
# via time-machine
|
| 212 |
python-dotenv==1.0.1
|
| 213 |
# via dagster
|
|
|
|
| 214 |
# via semantic-catalogue
|
| 215 |
# via uvicorn
|
|
|
|
|
|
|
| 216 |
pytz==2024.1
|
| 217 |
# via croniter
|
| 218 |
# via dagster
|
|
|
|
| 219 |
pyyaml==6.0.1
|
| 220 |
# via dagster
|
| 221 |
# via langchain
|
| 222 |
# via langchain-community
|
| 223 |
# via langchain-core
|
| 224 |
# via uvicorn
|
| 225 |
-
regex==2024.
|
|
|
|
| 226 |
# via tiktoken
|
| 227 |
requests==2.32.3
|
| 228 |
# via dagster
|
|
@@ -230,16 +287,23 @@ requests==2.32.3
|
|
| 230 |
# via gql
|
| 231 |
# via langchain
|
| 232 |
# via langchain-community
|
|
|
|
| 233 |
# via langsmith
|
| 234 |
# via requests-toolbelt
|
| 235 |
# via semantic-catalogue
|
|
|
|
| 236 |
# via tiktoken
|
| 237 |
requests-toolbelt==1.0.0
|
| 238 |
# via gql
|
| 239 |
rich==13.7.1
|
| 240 |
# via dagster
|
| 241 |
-
|
|
|
|
| 242 |
# via dagster
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
six==1.16.0
|
| 244 |
# via asttokens
|
| 245 |
# via python-dateutil
|
|
@@ -248,7 +312,7 @@ sniffio==1.3.1
|
|
| 248 |
# via httpx
|
| 249 |
# via openai
|
| 250 |
sourcery-cli==1.18.0
|
| 251 |
-
sqlalchemy==2.0.
|
| 252 |
# via alembic
|
| 253 |
# via dagster
|
| 254 |
# via langchain
|
|
@@ -258,17 +322,18 @@ stack-data==0.6.3
|
|
| 258 |
starlette==0.37.2
|
| 259 |
# via dagster-graphql
|
| 260 |
# via dagster-webserver
|
| 261 |
-
|
|
|
|
| 262 |
# via dagster
|
| 263 |
tabulate==0.9.0
|
| 264 |
# via dagster
|
| 265 |
-
tenacity==8.
|
| 266 |
# via langchain
|
| 267 |
# via langchain-community
|
| 268 |
# via langchain-core
|
| 269 |
tiktoken==0.7.0
|
| 270 |
# via langchain-openai
|
| 271 |
-
time-machine==2.14.
|
| 272 |
# via pendulum
|
| 273 |
tomli==2.0.1
|
| 274 |
# via dagster
|
|
@@ -281,26 +346,38 @@ tqdm==4.66.4
|
|
| 281 |
traitlets==5.14.3
|
| 282 |
# via ipython
|
| 283 |
# via matplotlib-inline
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
# via alembic
|
| 286 |
# via dagster
|
|
|
|
|
|
|
| 287 |
# via openai
|
| 288 |
# via pinecone-client
|
| 289 |
# via pydantic
|
| 290 |
# via pydantic-core
|
| 291 |
# via sqlalchemy
|
|
|
|
| 292 |
# via typing-inspect
|
| 293 |
typing-inspect==0.9.0
|
| 294 |
# via dataclasses-json
|
| 295 |
tzdata==2024.1
|
| 296 |
# via pendulum
|
|
|
|
|
|
|
| 297 |
universal-pathlib==0.2.2
|
| 298 |
# via dagster
|
| 299 |
-
urllib3==2.2.
|
| 300 |
# via pinecone-client
|
| 301 |
# via requests
|
| 302 |
-
|
|
|
|
| 303 |
# via dagster-webserver
|
|
|
|
|
|
|
| 304 |
uvloop==0.19.0
|
| 305 |
# via uvicorn
|
| 306 |
watchdog==4.0.1
|
|
|
|
| 7 |
# all-features: false
|
| 8 |
# with-sources: false
|
| 9 |
# generate-hashes: false
|
| 10 |
+
# universal: false
|
| 11 |
|
| 12 |
-e file:.
|
| 13 |
aiohttp==3.9.5
|
| 14 |
# via langchain
|
| 15 |
# via langchain-community
|
| 16 |
+
# via langchain-pinecone
|
| 17 |
aiosignal==1.3.1
|
| 18 |
# via aiohttp
|
| 19 |
+
alembic==1.13.2
|
| 20 |
# via dagster
|
| 21 |
aniso8601==9.0.1
|
| 22 |
# via graphene
|
|
|
|
| 34 |
# via aiohttp
|
| 35 |
backoff==2.2.1
|
| 36 |
# via gql
|
| 37 |
+
certifi==2024.7.4
|
| 38 |
# via httpcore
|
| 39 |
# via httpx
|
| 40 |
# via pinecone-client
|
| 41 |
# via requests
|
| 42 |
+
cffi==1.16.0
|
| 43 |
+
# via cryptography
|
| 44 |
charset-normalizer==3.3.2
|
| 45 |
+
# via pdfminer-six
|
| 46 |
# via requests
|
| 47 |
click==8.1.7
|
| 48 |
# via dagster
|
| 49 |
# via dagster-webserver
|
| 50 |
+
# via typer
|
| 51 |
# via uvicorn
|
| 52 |
coloredlogs==14.0
|
| 53 |
# via dagster
|
| 54 |
+
croniter==3.0.3
|
| 55 |
# via dagster
|
| 56 |
+
cryptography==43.0.0
|
| 57 |
+
# via pdfminer-six
|
| 58 |
+
dagster==1.7.15
|
| 59 |
# via dagster-graphql
|
| 60 |
# via dagster-openai
|
| 61 |
+
# via dagster-postgres
|
| 62 |
# via dagster-webserver
|
| 63 |
# via semantic-catalogue
|
| 64 |
+
dagster-graphql==1.7.15
|
| 65 |
# via dagster-webserver
|
| 66 |
+
dagster-openai==0.23.15
|
| 67 |
# via semantic-catalogue
|
| 68 |
+
dagster-pipes==1.7.15
|
| 69 |
# via dagster
|
| 70 |
+
dagster-postgres==0.23.15
|
| 71 |
+
# via semantic-catalogue
|
| 72 |
+
dagster-webserver==1.7.15
|
| 73 |
# via semantic-catalogue
|
| 74 |
+
dataclasses-json==0.6.7
|
| 75 |
# via langchain-community
|
| 76 |
+
dateparser==1.2.0
|
| 77 |
+
# via semantic-catalogue
|
| 78 |
decorator==5.1.1
|
| 79 |
# via ipdb
|
| 80 |
# via ipython
|
| 81 |
distro==1.9.0
|
| 82 |
# via openai
|
| 83 |
+
dnspython==2.6.1
|
| 84 |
+
# via email-validator
|
| 85 |
docstring-parser==0.16
|
| 86 |
# via dagster
|
| 87 |
+
email-validator==2.2.0
|
| 88 |
+
# via fastapi
|
| 89 |
executing==2.0.1
|
| 90 |
# via stack-data
|
| 91 |
+
fastapi==0.112.0
|
| 92 |
+
# via semantic-catalogue
|
| 93 |
+
fastapi-cli==0.0.5
|
| 94 |
+
# via fastapi
|
| 95 |
+
filelock==3.15.4
|
| 96 |
# via dagster
|
| 97 |
frozenlist==1.4.1
|
| 98 |
# via aiohttp
|
| 99 |
# via aiosignal
|
| 100 |
+
fsspec==2024.6.1
|
| 101 |
# via universal-pathlib
|
| 102 |
gql==3.5.0
|
| 103 |
# via dagster-graphql
|
|
|
|
| 111 |
# via graphene
|
| 112 |
greenlet==3.0.3
|
| 113 |
# via sqlalchemy
|
| 114 |
+
grpcio==1.64.1
|
| 115 |
# via dagster
|
| 116 |
# via grpcio-health-checking
|
| 117 |
grpcio-health-checking==1.62.2
|
|
|
|
| 124 |
httptools==0.6.1
|
| 125 |
# via uvicorn
|
| 126 |
httpx==0.27.0
|
| 127 |
+
# via fastapi
|
| 128 |
# via openai
|
| 129 |
humanfriendly==10.0
|
| 130 |
# via coloredlogs
|
| 131 |
idna==3.7
|
| 132 |
# via anyio
|
| 133 |
+
# via email-validator
|
| 134 |
# via httpx
|
| 135 |
# via requests
|
| 136 |
# via yarl
|
| 137 |
ipdb==0.13.13
|
| 138 |
+
ipython==8.26.0
|
| 139 |
# via ipdb
|
| 140 |
jedi==0.19.1
|
| 141 |
# via ipython
|
| 142 |
jinja2==3.1.4
|
| 143 |
# via dagster
|
| 144 |
+
# via fastapi
|
| 145 |
jsonpatch==1.33
|
| 146 |
# via langchain-core
|
| 147 |
+
jsonpointer==3.0.0
|
| 148 |
# via jsonpatch
|
| 149 |
+
langchain==0.2.12
|
| 150 |
# via langchain-community
|
| 151 |
# via semantic-catalogue
|
| 152 |
+
langchain-community==0.2.10
|
| 153 |
+
# via langchain-experimental
|
| 154 |
# via semantic-catalogue
|
| 155 |
+
langchain-core==0.2.27
|
| 156 |
# via langchain
|
| 157 |
# via langchain-community
|
| 158 |
+
# via langchain-experimental
|
| 159 |
# via langchain-openai
|
| 160 |
# via langchain-pinecone
|
| 161 |
# via langchain-text-splitters
|
| 162 |
+
# via langgraph
|
| 163 |
+
langchain-experimental==0.0.63
|
| 164 |
# via semantic-catalogue
|
| 165 |
+
langchain-openai==0.1.20
|
| 166 |
# via semantic-catalogue
|
| 167 |
+
langchain-pinecone==0.1.3
|
| 168 |
+
# via semantic-catalogue
|
| 169 |
+
langchain-text-splitters==0.2.2
|
| 170 |
# via langchain
|
| 171 |
+
langchainhub==0.1.20
|
| 172 |
+
# via semantic-catalogue
|
| 173 |
+
langgraph==0.1.19
|
| 174 |
+
# via semantic-catalogue
|
| 175 |
+
langsmith==0.1.96
|
| 176 |
# via langchain
|
| 177 |
# via langchain-community
|
| 178 |
# via langchain-core
|
| 179 |
+
lxml==5.2.2
|
| 180 |
+
# via semantic-catalogue
|
| 181 |
+
# via sickle
|
| 182 |
mako==1.3.5
|
| 183 |
# via alembic
|
| 184 |
markdown-it-py==3.0.0
|
|
|
|
| 186 |
markupsafe==2.1.5
|
| 187 |
# via jinja2
|
| 188 |
# via mako
|
| 189 |
+
marshmallow==3.21.3
|
| 190 |
# via dataclasses-json
|
| 191 |
matplotlib-inline==0.1.7
|
| 192 |
# via ipython
|
|
|
|
| 201 |
# via langchain
|
| 202 |
# via langchain-community
|
| 203 |
# via langchain-pinecone
|
| 204 |
+
openai==1.37.1
|
| 205 |
# via dagster-openai
|
| 206 |
# via langchain-openai
|
| 207 |
+
orjson==3.10.6
|
| 208 |
# via langsmith
|
| 209 |
+
packaging==24.1
|
| 210 |
# via dagster
|
| 211 |
# via langchain-core
|
| 212 |
+
# via langchainhub
|
| 213 |
# via marshmallow
|
| 214 |
parso==0.8.4
|
| 215 |
# via jedi
|
| 216 |
+
pdfminer-six==20240706
|
| 217 |
+
# via semantic-catalogue
|
| 218 |
pendulum==3.0.0
|
| 219 |
# via dagster
|
| 220 |
pexpect==4.9.0
|
| 221 |
# via ipython
|
| 222 |
+
pinecone-client==5.0.1
|
| 223 |
# via langchain-pinecone
|
| 224 |
+
pinecone-plugin-inference==1.0.3
|
| 225 |
+
# via pinecone-client
|
| 226 |
+
pinecone-plugin-interface==0.0.7
|
| 227 |
+
# via pinecone-client
|
| 228 |
+
# via pinecone-plugin-inference
|
| 229 |
+
polars==1.3.0
|
| 230 |
# via semantic-catalogue
|
| 231 |
+
prompt-toolkit==3.0.47
|
| 232 |
# via ipython
|
| 233 |
+
protobuf==4.25.4
|
| 234 |
# via dagster
|
| 235 |
# via grpcio-health-checking
|
| 236 |
+
psycopg2-binary==2.9.9
|
| 237 |
+
# via dagster-postgres
|
| 238 |
ptyprocess==0.7.0
|
| 239 |
# via pexpect
|
| 240 |
+
pure-eval==0.2.3
|
| 241 |
# via stack-data
|
| 242 |
+
pycparser==2.22
|
| 243 |
+
# via cffi
|
| 244 |
+
pydantic==2.8.2
|
| 245 |
# via dagster
|
| 246 |
+
# via fastapi
|
| 247 |
# via langchain
|
| 248 |
# via langchain-core
|
| 249 |
# via langsmith
|
| 250 |
# via openai
|
| 251 |
+
# via pydantic-settings
|
| 252 |
+
pydantic-core==2.20.1
|
| 253 |
# via pydantic
|
| 254 |
+
pydantic-settings==2.3.4
|
| 255 |
+
# via semantic-catalogue
|
| 256 |
pygments==2.18.0
|
| 257 |
# via ipython
|
| 258 |
# via rich
|
| 259 |
python-dateutil==2.9.0.post0
|
| 260 |
# via croniter
|
| 261 |
+
# via dateparser
|
| 262 |
# via pendulum
|
| 263 |
# via time-machine
|
| 264 |
python-dotenv==1.0.1
|
| 265 |
# via dagster
|
| 266 |
+
# via pydantic-settings
|
| 267 |
# via semantic-catalogue
|
| 268 |
# via uvicorn
|
| 269 |
+
python-multipart==0.0.9
|
| 270 |
+
# via fastapi
|
| 271 |
pytz==2024.1
|
| 272 |
# via croniter
|
| 273 |
# via dagster
|
| 274 |
+
# via dateparser
|
| 275 |
pyyaml==6.0.1
|
| 276 |
# via dagster
|
| 277 |
# via langchain
|
| 278 |
# via langchain-community
|
| 279 |
# via langchain-core
|
| 280 |
# via uvicorn
|
| 281 |
+
regex==2024.7.24
|
| 282 |
+
# via dateparser
|
| 283 |
# via tiktoken
|
| 284 |
requests==2.32.3
|
| 285 |
# via dagster
|
|
|
|
| 287 |
# via gql
|
| 288 |
# via langchain
|
| 289 |
# via langchain-community
|
| 290 |
+
# via langchainhub
|
| 291 |
# via langsmith
|
| 292 |
# via requests-toolbelt
|
| 293 |
# via semantic-catalogue
|
| 294 |
+
# via sickle
|
| 295 |
# via tiktoken
|
| 296 |
requests-toolbelt==1.0.0
|
| 297 |
# via gql
|
| 298 |
rich==13.7.1
|
| 299 |
# via dagster
|
| 300 |
+
# via typer
|
| 301 |
+
setuptools==72.1.0
|
| 302 |
# via dagster
|
| 303 |
+
shellingham==1.5.4
|
| 304 |
+
# via typer
|
| 305 |
+
sickle==0.7.0
|
| 306 |
+
# via semantic-catalogue
|
| 307 |
six==1.16.0
|
| 308 |
# via asttokens
|
| 309 |
# via python-dateutil
|
|
|
|
| 312 |
# via httpx
|
| 313 |
# via openai
|
| 314 |
sourcery-cli==1.18.0
|
| 315 |
+
sqlalchemy==2.0.31
|
| 316 |
# via alembic
|
| 317 |
# via dagster
|
| 318 |
# via langchain
|
|
|
|
| 322 |
starlette==0.37.2
|
| 323 |
# via dagster-graphql
|
| 324 |
# via dagster-webserver
|
| 325 |
+
# via fastapi
|
| 326 |
+
structlog==24.4.0
|
| 327 |
# via dagster
|
| 328 |
tabulate==0.9.0
|
| 329 |
# via dagster
|
| 330 |
+
tenacity==8.5.0
|
| 331 |
# via langchain
|
| 332 |
# via langchain-community
|
| 333 |
# via langchain-core
|
| 334 |
tiktoken==0.7.0
|
| 335 |
# via langchain-openai
|
| 336 |
+
time-machine==2.14.2
|
| 337 |
# via pendulum
|
| 338 |
tomli==2.0.1
|
| 339 |
# via dagster
|
|
|
|
| 346 |
traitlets==5.14.3
|
| 347 |
# via ipython
|
| 348 |
# via matplotlib-inline
|
| 349 |
+
typer==0.12.3
|
| 350 |
+
# via fastapi-cli
|
| 351 |
+
types-requests==2.32.0.20240712
|
| 352 |
+
# via langchainhub
|
| 353 |
+
typing-extensions==4.12.2
|
| 354 |
# via alembic
|
| 355 |
# via dagster
|
| 356 |
+
# via fastapi
|
| 357 |
+
# via langchain-core
|
| 358 |
# via openai
|
| 359 |
# via pinecone-client
|
| 360 |
# via pydantic
|
| 361 |
# via pydantic-core
|
| 362 |
# via sqlalchemy
|
| 363 |
+
# via typer
|
| 364 |
# via typing-inspect
|
| 365 |
typing-inspect==0.9.0
|
| 366 |
# via dataclasses-json
|
| 367 |
tzdata==2024.1
|
| 368 |
# via pendulum
|
| 369 |
+
tzlocal==5.2
|
| 370 |
+
# via dateparser
|
| 371 |
universal-pathlib==0.2.2
|
| 372 |
# via dagster
|
| 373 |
+
urllib3==2.2.2
|
| 374 |
# via pinecone-client
|
| 375 |
# via requests
|
| 376 |
+
# via types-requests
|
| 377 |
+
uvicorn==0.30.3
|
| 378 |
# via dagster-webserver
|
| 379 |
+
# via fastapi
|
| 380 |
+
# via fastapi-cli
|
| 381 |
uvloop==0.19.0
|
| 382 |
# via uvicorn
|
| 383 |
watchdog==4.0.1
|
requirements.lock
CHANGED
|
@@ -7,14 +7,16 @@
|
|
| 7 |
# all-features: false
|
| 8 |
# with-sources: false
|
| 9 |
# generate-hashes: false
|
|
|
|
| 10 |
|
| 11 |
-e file:.
|
| 12 |
aiohttp==3.9.5
|
| 13 |
# via langchain
|
| 14 |
# via langchain-community
|
|
|
|
| 15 |
aiosignal==1.3.1
|
| 16 |
# via aiohttp
|
| 17 |
-
alembic==1.13.
|
| 18 |
# via dagster
|
| 19 |
aniso8601==9.0.1
|
| 20 |
# via graphene
|
|
@@ -30,46 +32,65 @@ attrs==23.2.0
|
|
| 30 |
# via aiohttp
|
| 31 |
backoff==2.2.1
|
| 32 |
# via gql
|
| 33 |
-
certifi==2024.
|
| 34 |
# via httpcore
|
| 35 |
# via httpx
|
| 36 |
# via pinecone-client
|
| 37 |
# via requests
|
|
|
|
|
|
|
| 38 |
charset-normalizer==3.3.2
|
|
|
|
| 39 |
# via requests
|
| 40 |
click==8.1.7
|
| 41 |
# via dagster
|
| 42 |
# via dagster-webserver
|
|
|
|
| 43 |
# via uvicorn
|
| 44 |
coloredlogs==14.0
|
| 45 |
# via dagster
|
| 46 |
-
croniter==
|
| 47 |
# via dagster
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
# via dagster-graphql
|
| 50 |
# via dagster-openai
|
|
|
|
| 51 |
# via dagster-webserver
|
| 52 |
# via semantic-catalogue
|
| 53 |
-
dagster-graphql==1.7.
|
| 54 |
# via dagster-webserver
|
| 55 |
-
dagster-openai==0.23.
|
| 56 |
# via semantic-catalogue
|
| 57 |
-
dagster-pipes==1.7.
|
| 58 |
# via dagster
|
| 59 |
-
dagster-
|
|
|
|
|
|
|
| 60 |
# via semantic-catalogue
|
| 61 |
-
dataclasses-json==0.6.
|
| 62 |
# via langchain-community
|
|
|
|
|
|
|
| 63 |
distro==1.9.0
|
| 64 |
# via openai
|
|
|
|
|
|
|
| 65 |
docstring-parser==0.16
|
| 66 |
# via dagster
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# via dagster
|
| 69 |
frozenlist==1.4.1
|
| 70 |
# via aiohttp
|
| 71 |
# via aiosignal
|
| 72 |
-
fsspec==2024.
|
| 73 |
# via universal-pathlib
|
| 74 |
gql==3.5.0
|
| 75 |
# via dagster-graphql
|
|
@@ -83,7 +104,7 @@ graphql-relay==3.2.0
|
|
| 83 |
# via graphene
|
| 84 |
greenlet==3.0.3
|
| 85 |
# via sqlalchemy
|
| 86 |
-
grpcio==1.64.
|
| 87 |
# via dagster
|
| 88 |
# via grpcio-health-checking
|
| 89 |
grpcio-health-checking==1.62.2
|
|
@@ -96,41 +117,56 @@ httpcore==1.0.5
|
|
| 96 |
httptools==0.6.1
|
| 97 |
# via uvicorn
|
| 98 |
httpx==0.27.0
|
|
|
|
| 99 |
# via openai
|
| 100 |
humanfriendly==10.0
|
| 101 |
# via coloredlogs
|
| 102 |
idna==3.7
|
| 103 |
# via anyio
|
|
|
|
| 104 |
# via httpx
|
| 105 |
# via requests
|
| 106 |
# via yarl
|
| 107 |
jinja2==3.1.4
|
| 108 |
# via dagster
|
|
|
|
| 109 |
jsonpatch==1.33
|
| 110 |
# via langchain-core
|
| 111 |
-
jsonpointer==
|
| 112 |
# via jsonpatch
|
| 113 |
-
langchain==0.2.
|
| 114 |
# via langchain-community
|
| 115 |
# via semantic-catalogue
|
| 116 |
-
langchain-community==0.2.
|
|
|
|
| 117 |
# via semantic-catalogue
|
| 118 |
-
langchain-core==0.2.
|
| 119 |
# via langchain
|
| 120 |
# via langchain-community
|
|
|
|
| 121 |
# via langchain-openai
|
| 122 |
# via langchain-pinecone
|
| 123 |
# via langchain-text-splitters
|
| 124 |
-
|
|
|
|
| 125 |
# via semantic-catalogue
|
| 126 |
-
langchain-
|
| 127 |
# via semantic-catalogue
|
| 128 |
-
langchain-
|
|
|
|
|
|
|
| 129 |
# via langchain
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
# via langchain
|
| 132 |
# via langchain-community
|
| 133 |
# via langchain-core
|
|
|
|
|
|
|
|
|
|
| 134 |
mako==1.3.5
|
| 135 |
# via alembic
|
| 136 |
markdown-it-py==3.0.0
|
|
@@ -138,7 +174,7 @@ markdown-it-py==3.0.0
|
|
| 138 |
markupsafe==2.1.5
|
| 139 |
# via jinja2
|
| 140 |
# via mako
|
| 141 |
-
marshmallow==3.21.
|
| 142 |
# via dataclasses-json
|
| 143 |
mdurl==0.1.2
|
| 144 |
# via markdown-it-py
|
|
@@ -151,53 +187,74 @@ numpy==1.26.4
|
|
| 151 |
# via langchain
|
| 152 |
# via langchain-community
|
| 153 |
# via langchain-pinecone
|
| 154 |
-
openai==1.
|
| 155 |
# via dagster-openai
|
| 156 |
# via langchain-openai
|
| 157 |
-
orjson==3.10.
|
| 158 |
# via langsmith
|
| 159 |
-
packaging==
|
| 160 |
# via dagster
|
| 161 |
# via langchain-core
|
|
|
|
| 162 |
# via marshmallow
|
|
|
|
|
|
|
| 163 |
pendulum==3.0.0
|
| 164 |
# via dagster
|
| 165 |
-
pinecone-client==
|
| 166 |
# via langchain-pinecone
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
# via semantic-catalogue
|
| 169 |
-
protobuf==4.25.
|
| 170 |
# via dagster
|
| 171 |
# via grpcio-health-checking
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
# via dagster
|
|
|
|
| 174 |
# via langchain
|
| 175 |
# via langchain-core
|
| 176 |
# via langsmith
|
| 177 |
# via openai
|
| 178 |
-
pydantic-
|
|
|
|
| 179 |
# via pydantic
|
|
|
|
|
|
|
| 180 |
pygments==2.18.0
|
| 181 |
# via rich
|
| 182 |
python-dateutil==2.9.0.post0
|
| 183 |
# via croniter
|
| 184 |
-
# via
|
| 185 |
# via pendulum
|
| 186 |
# via time-machine
|
| 187 |
python-dotenv==1.0.1
|
| 188 |
# via dagster
|
|
|
|
| 189 |
# via semantic-catalogue
|
| 190 |
# via uvicorn
|
|
|
|
|
|
|
| 191 |
pytz==2024.1
|
| 192 |
# via croniter
|
| 193 |
# via dagster
|
|
|
|
| 194 |
pyyaml==6.0.1
|
| 195 |
# via dagster
|
| 196 |
# via langchain
|
| 197 |
# via langchain-community
|
| 198 |
# via langchain-core
|
| 199 |
# via uvicorn
|
| 200 |
-
regex==2024.
|
|
|
|
| 201 |
# via tiktoken
|
| 202 |
requests==2.32.3
|
| 203 |
# via dagster
|
|
@@ -205,23 +262,30 @@ requests==2.32.3
|
|
| 205 |
# via gql
|
| 206 |
# via langchain
|
| 207 |
# via langchain-community
|
|
|
|
| 208 |
# via langsmith
|
| 209 |
# via requests-toolbelt
|
| 210 |
# via semantic-catalogue
|
|
|
|
| 211 |
# via tiktoken
|
| 212 |
requests-toolbelt==1.0.0
|
| 213 |
# via gql
|
| 214 |
rich==13.7.1
|
| 215 |
# via dagster
|
| 216 |
-
|
|
|
|
| 217 |
# via dagster
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
six==1.16.0
|
| 219 |
# via python-dateutil
|
| 220 |
sniffio==1.3.1
|
| 221 |
# via anyio
|
| 222 |
# via httpx
|
| 223 |
# via openai
|
| 224 |
-
sqlalchemy==2.0.
|
| 225 |
# via alembic
|
| 226 |
# via dagster
|
| 227 |
# via langchain
|
|
@@ -229,17 +293,18 @@ sqlalchemy==2.0.30
|
|
| 229 |
starlette==0.37.2
|
| 230 |
# via dagster-graphql
|
| 231 |
# via dagster-webserver
|
| 232 |
-
|
|
|
|
| 233 |
# via dagster
|
| 234 |
tabulate==0.9.0
|
| 235 |
# via dagster
|
| 236 |
-
tenacity==8.
|
| 237 |
# via langchain
|
| 238 |
# via langchain-community
|
| 239 |
# via langchain-core
|
| 240 |
tiktoken==0.7.0
|
| 241 |
# via langchain-openai
|
| 242 |
-
time-machine==2.14.
|
| 243 |
# via pendulum
|
| 244 |
tomli==2.0.1
|
| 245 |
# via dagster
|
|
@@ -249,26 +314,38 @@ tqdm==4.66.4
|
|
| 249 |
# via dagster
|
| 250 |
# via openai
|
| 251 |
# via pinecone-client
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
# via alembic
|
| 254 |
# via dagster
|
|
|
|
|
|
|
| 255 |
# via openai
|
| 256 |
# via pinecone-client
|
| 257 |
# via pydantic
|
| 258 |
# via pydantic-core
|
| 259 |
# via sqlalchemy
|
|
|
|
| 260 |
# via typing-inspect
|
| 261 |
typing-inspect==0.9.0
|
| 262 |
# via dataclasses-json
|
| 263 |
tzdata==2024.1
|
| 264 |
# via pendulum
|
|
|
|
|
|
|
| 265 |
universal-pathlib==0.2.2
|
| 266 |
# via dagster
|
| 267 |
-
urllib3==2.2.
|
| 268 |
# via pinecone-client
|
| 269 |
# via requests
|
| 270 |
-
|
|
|
|
| 271 |
# via dagster-webserver
|
|
|
|
|
|
|
| 272 |
uvloop==0.19.0
|
| 273 |
# via uvicorn
|
| 274 |
watchdog==4.0.1
|
|
|
|
| 7 |
# all-features: false
|
| 8 |
# with-sources: false
|
| 9 |
# generate-hashes: false
|
| 10 |
+
# universal: false
|
| 11 |
|
| 12 |
-e file:.
|
| 13 |
aiohttp==3.9.5
|
| 14 |
# via langchain
|
| 15 |
# via langchain-community
|
| 16 |
+
# via langchain-pinecone
|
| 17 |
aiosignal==1.3.1
|
| 18 |
# via aiohttp
|
| 19 |
+
alembic==1.13.2
|
| 20 |
# via dagster
|
| 21 |
aniso8601==9.0.1
|
| 22 |
# via graphene
|
|
|
|
| 32 |
# via aiohttp
|
| 33 |
backoff==2.2.1
|
| 34 |
# via gql
|
| 35 |
+
certifi==2024.7.4
|
| 36 |
# via httpcore
|
| 37 |
# via httpx
|
| 38 |
# via pinecone-client
|
| 39 |
# via requests
|
| 40 |
+
cffi==1.16.0
|
| 41 |
+
# via cryptography
|
| 42 |
charset-normalizer==3.3.2
|
| 43 |
+
# via pdfminer-six
|
| 44 |
# via requests
|
| 45 |
click==8.1.7
|
| 46 |
# via dagster
|
| 47 |
# via dagster-webserver
|
| 48 |
+
# via typer
|
| 49 |
# via uvicorn
|
| 50 |
coloredlogs==14.0
|
| 51 |
# via dagster
|
| 52 |
+
croniter==3.0.3
|
| 53 |
# via dagster
|
| 54 |
+
cryptography==43.0.0
|
| 55 |
+
# via pdfminer-six
|
| 56 |
+
dagster==1.7.15
|
| 57 |
# via dagster-graphql
|
| 58 |
# via dagster-openai
|
| 59 |
+
# via dagster-postgres
|
| 60 |
# via dagster-webserver
|
| 61 |
# via semantic-catalogue
|
| 62 |
+
dagster-graphql==1.7.15
|
| 63 |
# via dagster-webserver
|
| 64 |
+
dagster-openai==0.23.15
|
| 65 |
# via semantic-catalogue
|
| 66 |
+
dagster-pipes==1.7.15
|
| 67 |
# via dagster
|
| 68 |
+
dagster-postgres==0.23.15
|
| 69 |
+
# via semantic-catalogue
|
| 70 |
+
dagster-webserver==1.7.15
|
| 71 |
# via semantic-catalogue
|
| 72 |
+
dataclasses-json==0.6.7
|
| 73 |
# via langchain-community
|
| 74 |
+
dateparser==1.2.0
|
| 75 |
+
# via semantic-catalogue
|
| 76 |
distro==1.9.0
|
| 77 |
# via openai
|
| 78 |
+
dnspython==2.6.1
|
| 79 |
+
# via email-validator
|
| 80 |
docstring-parser==0.16
|
| 81 |
# via dagster
|
| 82 |
+
email-validator==2.2.0
|
| 83 |
+
# via fastapi
|
| 84 |
+
fastapi==0.112.0
|
| 85 |
+
# via semantic-catalogue
|
| 86 |
+
fastapi-cli==0.0.5
|
| 87 |
+
# via fastapi
|
| 88 |
+
filelock==3.15.4
|
| 89 |
# via dagster
|
| 90 |
frozenlist==1.4.1
|
| 91 |
# via aiohttp
|
| 92 |
# via aiosignal
|
| 93 |
+
fsspec==2024.6.1
|
| 94 |
# via universal-pathlib
|
| 95 |
gql==3.5.0
|
| 96 |
# via dagster-graphql
|
|
|
|
| 104 |
# via graphene
|
| 105 |
greenlet==3.0.3
|
| 106 |
# via sqlalchemy
|
| 107 |
+
grpcio==1.64.1
|
| 108 |
# via dagster
|
| 109 |
# via grpcio-health-checking
|
| 110 |
grpcio-health-checking==1.62.2
|
|
|
|
| 117 |
httptools==0.6.1
|
| 118 |
# via uvicorn
|
| 119 |
httpx==0.27.0
|
| 120 |
+
# via fastapi
|
| 121 |
# via openai
|
| 122 |
humanfriendly==10.0
|
| 123 |
# via coloredlogs
|
| 124 |
idna==3.7
|
| 125 |
# via anyio
|
| 126 |
+
# via email-validator
|
| 127 |
# via httpx
|
| 128 |
# via requests
|
| 129 |
# via yarl
|
| 130 |
jinja2==3.1.4
|
| 131 |
# via dagster
|
| 132 |
+
# via fastapi
|
| 133 |
jsonpatch==1.33
|
| 134 |
# via langchain-core
|
| 135 |
+
jsonpointer==3.0.0
|
| 136 |
# via jsonpatch
|
| 137 |
+
langchain==0.2.12
|
| 138 |
# via langchain-community
|
| 139 |
# via semantic-catalogue
|
| 140 |
+
langchain-community==0.2.10
|
| 141 |
+
# via langchain-experimental
|
| 142 |
# via semantic-catalogue
|
| 143 |
+
langchain-core==0.2.27
|
| 144 |
# via langchain
|
| 145 |
# via langchain-community
|
| 146 |
+
# via langchain-experimental
|
| 147 |
# via langchain-openai
|
| 148 |
# via langchain-pinecone
|
| 149 |
# via langchain-text-splitters
|
| 150 |
+
# via langgraph
|
| 151 |
+
langchain-experimental==0.0.63
|
| 152 |
# via semantic-catalogue
|
| 153 |
+
langchain-openai==0.1.20
|
| 154 |
# via semantic-catalogue
|
| 155 |
+
langchain-pinecone==0.1.3
|
| 156 |
+
# via semantic-catalogue
|
| 157 |
+
langchain-text-splitters==0.2.2
|
| 158 |
# via langchain
|
| 159 |
+
langchainhub==0.1.20
|
| 160 |
+
# via semantic-catalogue
|
| 161 |
+
langgraph==0.1.19
|
| 162 |
+
# via semantic-catalogue
|
| 163 |
+
langsmith==0.1.96
|
| 164 |
# via langchain
|
| 165 |
# via langchain-community
|
| 166 |
# via langchain-core
|
| 167 |
+
lxml==5.2.2
|
| 168 |
+
# via semantic-catalogue
|
| 169 |
+
# via sickle
|
| 170 |
mako==1.3.5
|
| 171 |
# via alembic
|
| 172 |
markdown-it-py==3.0.0
|
|
|
|
| 174 |
markupsafe==2.1.5
|
| 175 |
# via jinja2
|
| 176 |
# via mako
|
| 177 |
+
marshmallow==3.21.3
|
| 178 |
# via dataclasses-json
|
| 179 |
mdurl==0.1.2
|
| 180 |
# via markdown-it-py
|
|
|
|
| 187 |
# via langchain
|
| 188 |
# via langchain-community
|
| 189 |
# via langchain-pinecone
|
| 190 |
+
openai==1.37.1
|
| 191 |
# via dagster-openai
|
| 192 |
# via langchain-openai
|
| 193 |
+
orjson==3.10.6
|
| 194 |
# via langsmith
|
| 195 |
+
packaging==24.1
|
| 196 |
# via dagster
|
| 197 |
# via langchain-core
|
| 198 |
+
# via langchainhub
|
| 199 |
# via marshmallow
|
| 200 |
+
pdfminer-six==20240706
|
| 201 |
+
# via semantic-catalogue
|
| 202 |
pendulum==3.0.0
|
| 203 |
# via dagster
|
| 204 |
+
pinecone-client==5.0.1
|
| 205 |
# via langchain-pinecone
|
| 206 |
+
pinecone-plugin-inference==1.0.3
|
| 207 |
+
# via pinecone-client
|
| 208 |
+
pinecone-plugin-interface==0.0.7
|
| 209 |
+
# via pinecone-client
|
| 210 |
+
# via pinecone-plugin-inference
|
| 211 |
+
polars==1.3.0
|
| 212 |
# via semantic-catalogue
|
| 213 |
+
protobuf==4.25.4
|
| 214 |
# via dagster
|
| 215 |
# via grpcio-health-checking
|
| 216 |
+
psycopg2-binary==2.9.9
|
| 217 |
+
# via dagster-postgres
|
| 218 |
+
pycparser==2.22
|
| 219 |
+
# via cffi
|
| 220 |
+
pydantic==2.8.2
|
| 221 |
# via dagster
|
| 222 |
+
# via fastapi
|
| 223 |
# via langchain
|
| 224 |
# via langchain-core
|
| 225 |
# via langsmith
|
| 226 |
# via openai
|
| 227 |
+
# via pydantic-settings
|
| 228 |
+
pydantic-core==2.20.1
|
| 229 |
# via pydantic
|
| 230 |
+
pydantic-settings==2.3.4
|
| 231 |
+
# via semantic-catalogue
|
| 232 |
pygments==2.18.0
|
| 233 |
# via rich
|
| 234 |
python-dateutil==2.9.0.post0
|
| 235 |
# via croniter
|
| 236 |
+
# via dateparser
|
| 237 |
# via pendulum
|
| 238 |
# via time-machine
|
| 239 |
python-dotenv==1.0.1
|
| 240 |
# via dagster
|
| 241 |
+
# via pydantic-settings
|
| 242 |
# via semantic-catalogue
|
| 243 |
# via uvicorn
|
| 244 |
+
python-multipart==0.0.9
|
| 245 |
+
# via fastapi
|
| 246 |
pytz==2024.1
|
| 247 |
# via croniter
|
| 248 |
# via dagster
|
| 249 |
+
# via dateparser
|
| 250 |
pyyaml==6.0.1
|
| 251 |
# via dagster
|
| 252 |
# via langchain
|
| 253 |
# via langchain-community
|
| 254 |
# via langchain-core
|
| 255 |
# via uvicorn
|
| 256 |
+
regex==2024.7.24
|
| 257 |
+
# via dateparser
|
| 258 |
# via tiktoken
|
| 259 |
requests==2.32.3
|
| 260 |
# via dagster
|
|
|
|
| 262 |
# via gql
|
| 263 |
# via langchain
|
| 264 |
# via langchain-community
|
| 265 |
+
# via langchainhub
|
| 266 |
# via langsmith
|
| 267 |
# via requests-toolbelt
|
| 268 |
# via semantic-catalogue
|
| 269 |
+
# via sickle
|
| 270 |
# via tiktoken
|
| 271 |
requests-toolbelt==1.0.0
|
| 272 |
# via gql
|
| 273 |
rich==13.7.1
|
| 274 |
# via dagster
|
| 275 |
+
# via typer
|
| 276 |
+
setuptools==72.1.0
|
| 277 |
# via dagster
|
| 278 |
+
shellingham==1.5.4
|
| 279 |
+
# via typer
|
| 280 |
+
sickle==0.7.0
|
| 281 |
+
# via semantic-catalogue
|
| 282 |
six==1.16.0
|
| 283 |
# via python-dateutil
|
| 284 |
sniffio==1.3.1
|
| 285 |
# via anyio
|
| 286 |
# via httpx
|
| 287 |
# via openai
|
| 288 |
+
sqlalchemy==2.0.31
|
| 289 |
# via alembic
|
| 290 |
# via dagster
|
| 291 |
# via langchain
|
|
|
|
| 293 |
starlette==0.37.2
|
| 294 |
# via dagster-graphql
|
| 295 |
# via dagster-webserver
|
| 296 |
+
# via fastapi
|
| 297 |
+
structlog==24.4.0
|
| 298 |
# via dagster
|
| 299 |
tabulate==0.9.0
|
| 300 |
# via dagster
|
| 301 |
+
tenacity==8.5.0
|
| 302 |
# via langchain
|
| 303 |
# via langchain-community
|
| 304 |
# via langchain-core
|
| 305 |
tiktoken==0.7.0
|
| 306 |
# via langchain-openai
|
| 307 |
+
time-machine==2.14.2
|
| 308 |
# via pendulum
|
| 309 |
tomli==2.0.1
|
| 310 |
# via dagster
|
|
|
|
| 314 |
# via dagster
|
| 315 |
# via openai
|
| 316 |
# via pinecone-client
|
| 317 |
+
typer==0.12.3
|
| 318 |
+
# via fastapi-cli
|
| 319 |
+
types-requests==2.32.0.20240712
|
| 320 |
+
# via langchainhub
|
| 321 |
+
typing-extensions==4.12.2
|
| 322 |
# via alembic
|
| 323 |
# via dagster
|
| 324 |
+
# via fastapi
|
| 325 |
+
# via langchain-core
|
| 326 |
# via openai
|
| 327 |
# via pinecone-client
|
| 328 |
# via pydantic
|
| 329 |
# via pydantic-core
|
| 330 |
# via sqlalchemy
|
| 331 |
+
# via typer
|
| 332 |
# via typing-inspect
|
| 333 |
typing-inspect==0.9.0
|
| 334 |
# via dataclasses-json
|
| 335 |
tzdata==2024.1
|
| 336 |
# via pendulum
|
| 337 |
+
tzlocal==5.2
|
| 338 |
+
# via dateparser
|
| 339 |
universal-pathlib==0.2.2
|
| 340 |
# via dagster
|
| 341 |
+
urllib3==2.2.2
|
| 342 |
# via pinecone-client
|
| 343 |
# via requests
|
| 344 |
+
# via types-requests
|
| 345 |
+
uvicorn==0.30.3
|
| 346 |
# via dagster-webserver
|
| 347 |
+
# via fastapi
|
| 348 |
+
# via fastapi-cli
|
| 349 |
uvloop==0.19.0
|
| 350 |
# via uvicorn
|
| 351 |
watchdog==4.0.1
|
src/__init__.py
CHANGED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
from dagster import Definitions, load_assets_from_modules
|
| 2 |
-
|
| 3 |
-
from src.assets import adr, datastore
|
| 4 |
-
from src.jobs import adr_job
|
| 5 |
-
from src.resources import openai_resource
|
| 6 |
-
|
| 7 |
-
adr_assets = load_assets_from_modules(modules=[adr], group_name="adr_assets")
|
| 8 |
-
datastore_assets = load_assets_from_modules(modules=[datastore], group_name="datastore")
|
| 9 |
-
|
| 10 |
-
defs = Definitions(
|
| 11 |
-
assets=[*adr_assets, *datastore_assets],
|
| 12 |
-
jobs=[adr_job],
|
| 13 |
-
resources={"openai": openai_resource},
|
| 14 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/assets/datastore.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
from dagster import AssetExecutionContext, asset
|
| 2 |
-
from dagster_openai import OpenAIResource
|
| 3 |
-
from dotenv import load_dotenv
|
| 4 |
-
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 5 |
-
from langchain_openai import OpenAIEmbeddings
|
| 6 |
-
from langchain_pinecone import PineconeVectorStore
|
| 7 |
-
from langchain_text_splitters import CharacterTextSplitter
|
| 8 |
-
from pinecone import Pinecone, ServerlessSpec
|
| 9 |
-
|
| 10 |
-
from src.common.utils import Consts, Paths
|
| 11 |
-
|
| 12 |
-
load_dotenv()
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
@asset(compute_kind="Pinecone")
|
| 16 |
-
def pinecone_index(context: AssetExecutionContext):
|
| 17 |
-
pc = Pinecone()
|
| 18 |
-
if Consts.INDEX_NAME in [index["name"] for index in pc.list_indexes()]:
|
| 19 |
-
pc.delete_index(Consts.INDEX_NAME)
|
| 20 |
-
|
| 21 |
-
pc.create_index(
|
| 22 |
-
name=Consts.INDEX_NAME,
|
| 23 |
-
dimension=Consts.EMBEDDING_DIM,
|
| 24 |
-
spec=ServerlessSpec(cloud="aws", region="us-east-1"),
|
| 25 |
-
metric="cosine",
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
@asset(compute_kind="OpenAI", deps=["adr_descriptions", "pinecone_index"])
|
| 30 |
-
def adr_pinecone(context: AssetExecutionContext, openai: OpenAIResource):
|
| 31 |
-
loader = DirectoryLoader(
|
| 32 |
-
Paths.ADR / "descriptions",
|
| 33 |
-
glob="*.txt",
|
| 34 |
-
loader_cls=TextLoader,
|
| 35 |
-
use_multithreading=True,
|
| 36 |
-
show_progress=True,
|
| 37 |
-
)
|
| 38 |
-
documents = loader.load()
|
| 39 |
-
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
|
| 40 |
-
docs = text_splitter.split_documents(documents)
|
| 41 |
-
|
| 42 |
-
with openai.get_client(context) as client:
|
| 43 |
-
embeddings = OpenAIEmbeddings(
|
| 44 |
-
client=client.embeddings,
|
| 45 |
-
model=Consts.EMBEDDING_MODEL,
|
| 46 |
-
)
|
| 47 |
-
|
| 48 |
-
PineconeVectorStore.from_documents(docs, embeddings, index_name=Consts.INDEX_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/common/logging.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging.config
|
| 2 |
+
|
| 3 |
+
logger = logging.getLogger("data-catalogue")
|
src/common/settings.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tomllib
|
| 2 |
+
|
| 3 |
+
from pydantic import Field
|
| 4 |
+
from pydantic_settings import BaseSettings
|
| 5 |
+
|
| 6 |
+
with open("./config/config.toml", "rb") as f:
|
| 7 |
+
Config = tomllib.load(f)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DataStoreSettings(BaseSettings):
|
| 11 |
+
index_name: str = Field(min_length=1)
|
| 12 |
+
embed_model: str = Field(min_length=1)
|
| 13 |
+
embed_dim: int = Field(gt=0, le=10_000)
|
| 14 |
+
chunk_size: int = Field(gt=0, le=10_000)
|
| 15 |
+
chunk_overlap: int = Field(ge=0, le=10_000)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ModelSettings(BaseSettings):
|
| 19 |
+
llm: str = Field(min_length=1)
|
| 20 |
+
top_k: int = Field(gt=0, le=100)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Settings(BaseSettings):
|
| 24 |
+
model: ModelSettings = ModelSettings.model_validate(Config["model"])
|
| 25 |
+
datastore: DataStoreSettings = DataStoreSettings.model_validate(Config["datastore"])
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
cfg = Settings()
|
src/common/utils.py
CHANGED
|
@@ -1,12 +1,21 @@
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
class Paths:
|
| 5 |
-
DATA = Path("data")
|
| 6 |
ADR = DATA / "adr"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
-
|
| 10 |
-
INDEX_NAME = "data-catalogue"
|
| 11 |
-
EMBEDDING_MODEL = "text-embedding-3-large"
|
| 12 |
-
EMBEDDING_DIM = 3072
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
|
| 8 |
class Paths:
|
| 9 |
+
DATA: Path = Path("data")
|
| 10 |
ADR = DATA / "adr"
|
| 11 |
+
UKDS = DATA / "ukds"
|
| 12 |
+
CDRC = DATA / "cdrc"
|
| 13 |
+
|
| 14 |
+
@classmethod
|
| 15 |
+
def ensure_directories_exist(cls):
|
| 16 |
+
cls.ADR.mkdir(parents=True, exist_ok=True)
|
| 17 |
+
cls.UKDS.mkdir(parents=True, exist_ok=True)
|
| 18 |
+
cls.CDRC.mkdir(parents=True, exist_ok=True)
|
| 19 |
|
| 20 |
|
| 21 |
+
Paths.ensure_directories_exist()
|
|
|
|
|
|
|
|
|
src/datastore/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dagster import Definitions, load_assets_from_modules
|
| 2 |
+
|
| 3 |
+
from src.datastore.assets import adr, cdrc, datastore, ukds
|
| 4 |
+
from src.datastore.jobs import adr_job, cdrc_job, ukds_job
|
| 5 |
+
from src.datastore.resources import openai_resource
|
| 6 |
+
from src.datastore.schedules import adr_schedule, cdrc_schedule, ukds_schedule
|
| 7 |
+
|
| 8 |
+
adr_assets = load_assets_from_modules(modules=[adr], group_name="adr_assets")
|
| 9 |
+
ukds_assets = load_assets_from_modules(modules=[ukds], group_name="ukds_assets")
|
| 10 |
+
cdrc_assets = load_assets_from_modules(modules=[cdrc], group_name="cdrc_assets")
|
| 11 |
+
datastore_assets = load_assets_from_modules(modules=[datastore], group_name="datastore")
|
| 12 |
+
|
| 13 |
+
defs = Definitions(
|
| 14 |
+
assets=[*ukds_assets, *adr_assets, *cdrc_assets, *datastore_assets],
|
| 15 |
+
jobs=[adr_job, ukds_job, cdrc_job],
|
| 16 |
+
schedules=[adr_schedule, ukds_schedule, cdrc_schedule],
|
| 17 |
+
resources={"openai": openai_resource},
|
| 18 |
+
)
|
src/{assets → datastore/assets}/adr.py
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import itertools
|
| 2 |
import json
|
| 3 |
-
import logging
|
| 4 |
|
| 5 |
import polars as pl
|
| 6 |
import requests
|
| 7 |
from dagster import AssetExecutionContext, asset
|
|
|
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
from src.common.utils import Paths
|
|
@@ -15,7 +15,7 @@ BASE_URL = "https://api-datacatalogue.adruk.org/api"
|
|
| 15 |
|
| 16 |
|
| 17 |
@asset
|
| 18 |
-
def adr_session():
|
| 19 |
session = requests.Session()
|
| 20 |
session.headers.update({"X-API-Version": API_VERSION})
|
| 21 |
return session
|
|
@@ -28,16 +28,26 @@ def adr_datasets_id(
|
|
| 28 |
datasets = []
|
| 29 |
for page_number in itertools.count(start=1):
|
| 30 |
context.log.info(f"Fetching page {page_number}")
|
| 31 |
-
datasets_page = _fetch_datasets_page(adr_session, page_number)
|
| 32 |
-
if
|
|
|
|
| 33 |
break
|
| 34 |
datasets.extend(datasets_page)
|
| 35 |
-
df =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
df.write_parquet(Paths.ADR / "adr_datasets_id.parquet")
|
|
|
|
| 37 |
return df
|
| 38 |
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
params = {
|
| 42 |
"pageSize": PAGE_SIZE,
|
| 43 |
"pageNumber": page_number,
|
|
@@ -50,11 +60,16 @@ def _fetch_datasets_page(adr_session: requests.Session, page_number: int) -> dic
|
|
| 50 |
try:
|
| 51 |
response = adr_session.get(f"{BASE_URL}/{{sql}}/dataset", params=params)
|
| 52 |
response.raise_for_status()
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
except requests.HTTPError as http_err:
|
| 55 |
-
|
|
|
|
| 56 |
except Exception as err:
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
@asset
|
|
@@ -63,19 +78,26 @@ def adr_datasets(
|
|
| 63 |
adr_session: requests.Session,
|
| 64 |
adr_datasets_id: pl.DataFrame,
|
| 65 |
) -> pl.DataFrame:
|
| 66 |
-
df = adr_datasets_id.filter(pl.col("searchResultType") == "PHYSICAL").with_columns(
|
| 67 |
-
pl.col("origin").struct[0].alias("origin_id")
|
| 68 |
-
)
|
| 69 |
|
| 70 |
datasets_list = []
|
| 71 |
-
for row in tqdm(
|
| 72 |
dataset = _fetch_dataset_info(context, adr_session, row)
|
| 73 |
-
|
|
|
|
| 74 |
df = pl.DataFrame(datasets_list)
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
return df
|
| 77 |
|
| 78 |
|
|
|
|
| 79 |
def _fetch_dataset_info(
|
| 80 |
context: AssetExecutionContext, adr_session: requests.Session, row: dict
|
| 81 |
) -> dict:
|
|
@@ -85,8 +107,10 @@ def _fetch_dataset_info(
|
|
| 85 |
response.raise_for_status()
|
| 86 |
except requests.HTTPError as http_err:
|
| 87 |
context.log.error(f"HTTP error occurred: {http_err}")
|
|
|
|
| 88 |
except Exception as err:
|
| 89 |
context.log.error(f"Other error occurred: {err}")
|
|
|
|
| 90 |
content = json.loads(response.content)
|
| 91 |
|
| 92 |
return {
|
|
@@ -105,10 +129,12 @@ def _fetch_dataset_info(
|
|
| 105 |
|
| 106 |
@asset
|
| 107 |
def adr_descriptions(adr_datasets: pl.DataFrame) -> None:
|
|
|
|
|
|
|
|
|
|
| 108 |
for item in adr_datasets.rows(named=True):
|
| 109 |
with open(
|
| 110 |
-
|
| 111 |
-
/ f"descriptions/{item['id']}-{item['origin_id']}-description.txt",
|
| 112 |
"w",
|
| 113 |
) as f:
|
| 114 |
f.write(
|
|
|
|
| 1 |
import itertools
|
| 2 |
import json
|
|
|
|
| 3 |
|
| 4 |
import polars as pl
|
| 5 |
import requests
|
| 6 |
from dagster import AssetExecutionContext, asset
|
| 7 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
from src.common.utils import Paths
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
@asset
|
| 18 |
+
def adr_session() -> requests.Session:
|
| 19 |
session = requests.Session()
|
| 20 |
session.headers.update({"X-API-Version": API_VERSION})
|
| 21 |
return session
|
|
|
|
| 28 |
datasets = []
|
| 29 |
for page_number in itertools.count(start=1):
|
| 30 |
context.log.info(f"Fetching page {page_number}")
|
| 31 |
+
datasets_page = _fetch_datasets_page(context, adr_session, page_number)
|
| 32 |
+
if "end" in datasets_page:
|
| 33 |
+
context.log.info(f"End of pages reached at {datasets_page['end']}")
|
| 34 |
break
|
| 35 |
datasets.extend(datasets_page)
|
| 36 |
+
df = (
|
| 37 |
+
pl.DataFrame(datasets)
|
| 38 |
+
.select(["origin", "id", "searchResultType", "title"])
|
| 39 |
+
.filter(pl.col("searchResultType") == "PHYSICAL")
|
| 40 |
+
.with_columns(pl.col("origin").struct[0].alias("origin_id"))
|
| 41 |
+
)
|
| 42 |
df.write_parquet(Paths.ADR / "adr_datasets_id.parquet")
|
| 43 |
+
|
| 44 |
return df
|
| 45 |
|
| 46 |
|
| 47 |
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
| 48 |
+
def _fetch_datasets_page(
|
| 49 |
+
context: AssetExecutionContext, adr_session: requests.Session, page_number: int
|
| 50 |
+
) -> dict:
|
| 51 |
params = {
|
| 52 |
"pageSize": PAGE_SIZE,
|
| 53 |
"pageNumber": page_number,
|
|
|
|
| 60 |
try:
|
| 61 |
response = adr_session.get(f"{BASE_URL}/{{sql}}/dataset", params=params)
|
| 62 |
response.raise_for_status()
|
| 63 |
+
content = json.loads(response.content)["content"]
|
| 64 |
+
if not content:
|
| 65 |
+
return {"end": page_number}
|
| 66 |
+
return content
|
| 67 |
except requests.HTTPError as http_err:
|
| 68 |
+
context.log.error(f"HTTP error occurred: {http_err}")
|
| 69 |
+
return {}
|
| 70 |
except Exception as err:
|
| 71 |
+
context.log.error(f"Other error occurred: {err}")
|
| 72 |
+
return {}
|
| 73 |
|
| 74 |
|
| 75 |
@asset
|
|
|
|
| 78 |
adr_session: requests.Session,
|
| 79 |
adr_datasets_id: pl.DataFrame,
|
| 80 |
) -> pl.DataFrame:
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
datasets_list = []
|
| 83 |
+
for row in tqdm(adr_datasets_id.rows(named=True), total=len(adr_datasets_id)):
|
| 84 |
dataset = _fetch_dataset_info(context, adr_session, row)
|
| 85 |
+
if dataset:
|
| 86 |
+
datasets_list.append(dataset)
|
| 87 |
df = pl.DataFrame(datasets_list)
|
| 88 |
+
(
|
| 89 |
+
df.with_columns(
|
| 90 |
+
pl.col("coverage").struct[0].alias("coverage_0"),
|
| 91 |
+
pl.col("coverage").struct[1].alias("coverage_1"),
|
| 92 |
+
)
|
| 93 |
+
.drop("coverage")
|
| 94 |
+
.write_parquet(Paths.ADR / "adr_datasets.parquet")
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
return df
|
| 98 |
|
| 99 |
|
| 100 |
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
| 101 |
def _fetch_dataset_info(
|
| 102 |
context: AssetExecutionContext, adr_session: requests.Session, row: dict
|
| 103 |
) -> dict:
|
|
|
|
| 107 |
response.raise_for_status()
|
| 108 |
except requests.HTTPError as http_err:
|
| 109 |
context.log.error(f"HTTP error occurred: {http_err}")
|
| 110 |
+
return {}
|
| 111 |
except Exception as err:
|
| 112 |
context.log.error(f"Other error occurred: {err}")
|
| 113 |
+
return {}
|
| 114 |
content = json.loads(response.content)
|
| 115 |
|
| 116 |
return {
|
|
|
|
| 129 |
|
| 130 |
@asset
|
| 131 |
def adr_descriptions(adr_datasets: pl.DataFrame) -> None:
|
| 132 |
+
outdir = Paths.ADR / "txt"
|
| 133 |
+
outdir.mkdir(parents=True, exist_ok=True)
|
| 134 |
+
|
| 135 |
for item in adr_datasets.rows(named=True):
|
| 136 |
with open(
|
| 137 |
+
outdir / f"{item['id']}-{item['origin_id']}-description.txt",
|
|
|
|
| 138 |
"w",
|
| 139 |
) as f:
|
| 140 |
f.write(
|
src/datastore/assets/cdrc.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import itertools
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import polars as pl
|
| 6 |
+
import requests
|
| 7 |
+
from dagster import AssetExecutionContext, asset
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
from src.common.utils import Paths
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
METADATA_URL = (
|
| 16 |
+
"https://data.cdrc.ac.uk/api/3/action/current_package_list_with_resources"
|
| 17 |
+
)
|
| 18 |
+
LOGIN_URL = "https://data.cdrc.ac.uk/user/login"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@asset
|
| 22 |
+
def cdrc_metadata(context: AssetExecutionContext) -> list[dict]:
|
| 23 |
+
try:
|
| 24 |
+
r = requests.get(METADATA_URL)
|
| 25 |
+
r.raise_for_status()
|
| 26 |
+
except requests.HTTPError as http_err:
|
| 27 |
+
context.log.error(f"HTTP error occurred: {http_err}")
|
| 28 |
+
raise
|
| 29 |
+
except Exception as err:
|
| 30 |
+
context.log.error(f"Other error occurred: {err}")
|
| 31 |
+
raise
|
| 32 |
+
catalogue_metadata = r.json()["result"][0]
|
| 33 |
+
return catalogue_metadata
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@asset
|
| 37 |
+
def cdrc_notes(cdrc_metadata: list[dict]):
|
| 38 |
+
outdir = Paths.CDRC / "txt"
|
| 39 |
+
outdir.mkdir(parents=True, exist_ok=True)
|
| 40 |
+
|
| 41 |
+
df = pl.DataFrame(cdrc_metadata).drop(["resources", "tags", "extras"])
|
| 42 |
+
df.write_parquet(Paths.CDRC / "cdrc_metadata.parquet")
|
| 43 |
+
|
| 44 |
+
for item in df.rows(named=True):
|
| 45 |
+
with open(Paths.CDRC / "txt" / f"{item['id']}-notes.txt", "w") as f:
|
| 46 |
+
f.write(
|
| 47 |
+
f"Dataset Title: {item['title']}"
|
| 48 |
+
"\n\nDescription: \n\n"
|
| 49 |
+
f"{re.sub('<[^<]+?>','', item['notes'])}"
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@asset
def cdrc_resources(cdrc_metadata: list[dict]) -> pl.DataFrame:
    """Build a table of downloadable PDF resources joined to their parent dataset."""
    # Flatten every package's resource list into one flat list of resource dicts.
    resources = list(
        itertools.chain.from_iterable(
            [item["resources"] if "resources" in item else [] for item in cdrc_metadata]
        )
    )
    # Horizontal concat relies on both frames having one row per resource, in
    # the same order: exploding packages by "resources" mirrors the chain above.
    # NOTE(review): a package with no "resources" key (or an empty list) would
    # still produce an exploded null row while contributing nothing to
    # `resources`, misaligning the concat — confirm the API always populates it.
    df = pl.concat(
        [
            pl.DataFrame(cdrc_metadata).explode("resources").drop("resources"),
            pl.DataFrame(resources)
            .rename(
                {"id": "resource_id", "url": "resource_url", "name": "resource_name"},
            )
            .drop(["state", "revision_timestamp"]),
        ],
        how="horizontal",
    ).filter((pl.col("format") == "pdf") & (pl.col("resource_url") != ""))
    df.write_parquet(Paths.CDRC / "cdrc_resource_metadata.parquet")
    return df
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@asset
def cdrc_session() -> requests.Session:
    """Return a requests Session logged in to the CDRC Drupal site.

    Credentials (CDRC_USERNAME / CDRC_PASSWORD / CDRC_FORM_BUILD_ID) come from
    the environment via dotenv. The login response is not validated here; a
    failed login surfaces later as failed PDF downloads.
    """
    session = requests.Session()
    session.post(
        LOGIN_URL,
        data={
            "name": os.getenv("CDRC_USERNAME"),
            "pass": os.getenv("CDRC_PASSWORD"),
            "form_build_id": os.getenv("CDRC_FORM_BUILD_ID"),
            "form_id": "user_login",
            "op": "Log in",
        },
        # Timeout so a stalled login cannot hang the whole job.
        timeout=60,
    )
    return session
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@asset
def cdrc_pdfs(
    context: AssetExecutionContext,
    cdrc_session: requests.Session,
    cdrc_resources: pl.DataFrame,
):
    """Download every PDF resource using the authenticated session.

    Files are written as "<dataset-id>-<resource-id>.pdf"; failures for a
    single resource are logged and skipped so one bad URL does not abort the run.
    """
    outdir = Paths.CDRC / "pdf"
    outdir.mkdir(parents=True, exist_ok=True)

    for item in tqdm(cdrc_resources.rows(named=True)):
        context.log.info(f"Processing {item['resource_url']}...")
        filepath = outdir / f"{item['id']}-{item['resource_id']}.pdf"
        try:
            # Per-file timeout: a single stalled download must not hang the job.
            file = cdrc_session.get(item["resource_url"], timeout=120)
            file.raise_for_status()
        except requests.HTTPError as http_err:
            context.log.error(f"HTTP error occurred: {http_err}")
            continue
        except Exception as err:
            context.log.error(f"Other error occurred: {err}")
            continue
        with open(filepath, "wb") as f:
            f.write(file.content)
|
src/datastore/assets/datastore.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from dagster import (
|
| 5 |
+
AssetExecutionContext,
|
| 6 |
+
AutoMaterializePolicy,
|
| 7 |
+
AutoMaterializeRule,
|
| 8 |
+
asset,
|
| 9 |
+
)
|
| 10 |
+
from dagster_openai import OpenAIResource
|
| 11 |
+
from langchain_community.document_loaders import DirectoryLoader
|
| 12 |
+
from langchain_experimental.text_splitter import SemanticChunker
|
| 13 |
+
from langchain_openai import OpenAIEmbeddings
|
| 14 |
+
from langchain_pinecone import PineconeVectorStore
|
| 15 |
+
from pinecone import Pinecone, ServerlessSpec
|
| 16 |
+
|
| 17 |
+
from src.common.settings import cfg
|
| 18 |
+
from src.common.utils import Paths
|
| 19 |
+
from src.datastore.loaders import ADRLoader, CDRCLoader, UKDSLoader
|
| 20 |
+
|
| 21 |
+
wait_on_all_parents_policy = AutoMaterializePolicy.eager().with_rules(
|
| 22 |
+
AutoMaterializeRule.skip_on_not_all_parents_updated()
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _process_documents(
    context: AssetExecutionContext,
    openai: OpenAIResource,
    paths: list[Path],
    glob_patterns: list[str],
    loader_classes: list[type],
):
    """Load documents from each (path, glob, loader) triple, semantically chunk
    them with OpenAI embeddings, and upsert the chunks into the Pinecone index.

    The three list arguments are parallel: element i of each describes one
    source directory. The Pinecone index named in config must already exist.
    """
    documents = []
    for path, glob_pattern, loader_cls in zip(paths, glob_patterns, loader_classes):
        loader = DirectoryLoader(
            str(path),
            glob=glob_pattern,
            loader_cls=loader_cls,
            use_multithreading=True,
            show_progress=True,
        )
        documents.extend(loader.load())

    with openai.get_client(context) as client:
        embeddings = OpenAIEmbeddings(
            client=client.embeddings, model=cfg.datastore.embed_model
        )

        # Split on semantic breakpoints (percentile threshold) rather than a
        # fixed character size.
        text_splitter = SemanticChunker(
            embeddings=embeddings, breakpoint_threshold_type="percentile"
        )
        docs = text_splitter.split_documents(documents)

        vectorstore = PineconeVectorStore(
            index_name=cfg.datastore.index_name, embedding=embeddings
        )
        vectorstore.add_documents(documents=docs)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@asset(
    compute_kind="Pinecone",
    deps=["adr_descriptions", "ukds_abstracts", "cdrc_notes", "cdrc_pdfs"],
    auto_materialize_policy=wait_on_all_parents_policy,
)
def pinecone_index(context: AssetExecutionContext, openai: OpenAIResource):
    """Rebuild the Pinecone index from scratch and ingest all harvested documents.

    Destructive: any existing index with the configured name is deleted first,
    so search is briefly degraded while ingestion runs.
    """
    pc = Pinecone()
    if cfg.datastore.index_name in [index["name"] for index in pc.list_indexes()]:
        pc.delete_index(cfg.datastore.index_name)

    # NOTE(review): cloud/region are hard-coded to aws/us-east-1 — confirm this
    # matches the deployment, or move them into config alongside index_name.
    pc.create_index(
        name=cfg.datastore.index_name,
        dimension=cfg.datastore.embed_dim,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        metric="cosine",
    )
    # Poll until the serverless index reports ready before ingesting.
    while not pc.describe_index(cfg.datastore.index_name).status["ready"]:
        time.sleep(1)

    _process_documents(
        context,
        openai,
        paths=[Paths.ADR / "txt"],
        glob_patterns=["*.txt"],
        loader_classes=[ADRLoader],
    )
    _process_documents(
        context,
        openai,
        paths=[Paths.CDRC / "txt", Paths.CDRC / "pdf"],
        glob_patterns=["*.txt", "*.pdf"],
        loader_classes=[CDRCLoader, CDRCLoader],
    )
    _process_documents(
        context,
        openai,
        paths=[Paths.UKDS / "txt"],
        loader_classes=[UKDSLoader],
        glob_patterns=["*.txt"],
    )
|
src/datastore/assets/ukds.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import xml.etree.ElementTree as ET
|
| 3 |
+
|
| 4 |
+
import polars as pl
|
| 5 |
+
import requests
|
| 6 |
+
from dagster import AssetExecutionContext, asset
|
| 7 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
from src.common.utils import Paths
|
| 11 |
+
|
| 12 |
+
BASE_URL = "https://oai.ukdataservice.ac.uk:8443/oai/provider"
|
| 13 |
+
PARAMS = {"verb": "ListIdentifiers", "metadataPrefix": "ddi", "set": "DataCollections"}
|
| 14 |
+
NAMESPACES = {"oai": "http://www.openarchives.org/OAI/2.0/", "ns2": "ddi:codebook:2_5"}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@asset
def ukds_identifiers() -> list[str]:
    """Harvest all non-deleted study identifiers from the UKDS OAI-PMH endpoint.

    Follows resumptionToken paging until the provider stops returning a token.
    A non-200 page aborts the harvest early with whatever was collected so far.
    """
    params = PARAMS.copy()

    identifiers = []
    while True:
        # Timeout guards the scheduled harvest against a hung connection.
        response = requests.get(BASE_URL, params=params, timeout=60)
        if response.status_code != 200:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            break

        root = ET.fromstring(response.content)

        # Skip records the provider has marked as deleted.
        headers = root.findall(".//oai:header", NAMESPACES)
        for header in headers:
            if header.attrib.get("status") != "deleted":
                identifier = header.find(".//oai:identifier", NAMESPACES)
                if identifier is not None:
                    identifiers.append(identifier.text)

        token_elem = root.find(".//oai:resumptionToken", NAMESPACES)
        if token_elem is None or token_elem.text is None:
            break

        # Subsequent pages are requested with the resumption token alone.
        params = {"verb": "ListIdentifiers", "resumptionToken": token_elem.text}
    return identifiers
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def _fetch_metadata(context: AssetExecutionContext, identifier: str) -> ET.Element:
    """Fetch and parse the DDI record for one OAI identifier.

    Retries up to 3 times with exponential backoff (via tenacity); errors are
    logged then re-raised so the retry decorator can act on them.

    Note: the annotation was corrected from ``int`` — identifiers are the OAI
    identifier strings harvested by ``ukds_identifiers``.
    """
    metadata_url = (
        f"{BASE_URL}?verb=GetRecord&identifier={identifier}&metadataPrefix=ddi"
    )
    try:
        # Timeout so a single hung record cannot stall the whole harvest.
        response = requests.get(metadata_url, timeout=60)
        response.raise_for_status()
    except requests.HTTPError as http_err:
        context.log.error(f"HTTP error occurred: {http_err}")
        raise
    except Exception as err:
        context.log.error(f"Other error occurred: {err}")
        raise

    root = ET.fromstring(response.content)
    return root
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@asset
def ukds_datasets(
    context: AssetExecutionContext, ukds_identifiers: list[str]
) -> pl.DataFrame:
    """Fetch DDI metadata for every UKDS study and persist it as a parquet table."""
    data = []
    for identifier in tqdm(ukds_identifiers):
        context.log.info(f"Fetching identifier {identifier}")
        metadata = _fetch_metadata(context, identifier).find(
            ".//ns2:stdyDscr", NAMESPACES
        )

        # Records without a study-description section carry nothing useful.
        if metadata is None:
            continue

        # Join all abstract paragraphs, stripping embedded HTML tags.
        abstract = "\n".join(
            [
                re.sub("<[^<]+?>", "", m.text)
                for m in metadata.findall(".//ns2:abstract", NAMESPACES)
                if m.text is not None
            ]
        )
        date = metadata.find(".//ns2:depDate", NAMESPACES)
        title = metadata.find(".//ns2:titl", NAMESPACES)
        keywords = [
            m.text
            for m in metadata.findall(".//ns2:keyword", NAMESPACES)
            if m.text is not None
        ]
        doi = metadata.find(".//ns2:holdings", NAMESPACES)
        # Catalogue URL doubles as the stable id (parsed back out downstream).
        url = f"https://beta.ukdataservice.ac.uk/datacatalogue/studies/study?id={identifier}"
        data.append(
            {
                "title": title.text if title is not None else None,
                "abstract": abstract,
                "date": date.text if date is not None else None,
                "keywords": keywords,
                "doi": doi.get("URI") if doi is not None else None,
                "url": url,
            }
        )

    df = pl.DataFrame(data)
    df.write_parquet(Paths.UKDS / "ukds.parquet")
    return df
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@asset
def ukds_abstracts(ukds_datasets: pl.DataFrame):
    """Write one "<id>-abstract.txt" file per UKDS study for downstream embedding."""
    outdir = Paths.UKDS / "txt"
    outdir.mkdir(parents=True, exist_ok=True)

    for row in ukds_datasets.rows(named=True):
        # The study id is the query parameter at the end of the catalogue URL.
        # (renamed from `id` to avoid shadowing the builtin)
        study_id = row["url"].split("=")[-1]
        # Drop the boilerplate copyright sentence present in every abstract.
        abstract = row["abstract"].replace(
            "Abstract copyright UK Data Service and data collection copyright owner.",
            "",
        )
        # Explicit encoding: abstracts may contain non-ASCII characters.
        with open(outdir / f"{study_id}-abstract.txt", "w", encoding="utf-8") as f:
            f.write(f"Dataset Title: {row['title']}" f"\n\nDescription: \n\n{abstract}")
|
src/{jobs.py → datastore/jobs.py}
RENAMED
|
@@ -4,4 +4,17 @@ adr_job = define_asset_job(
|
|
| 4 |
"adr",
|
| 5 |
selection=["adr_session", "adr_datasets_id", "adr_datasets", "adr_descriptions"],
|
| 6 |
)
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"adr",
|
| 5 |
selection=["adr_session", "adr_datasets_id", "adr_datasets", "adr_descriptions"],
|
| 6 |
)
|
| 7 |
+
# Asset jobs for the two new harvest pipelines; each selects the full chain of
# assets for its data provider so a run materialises the whole pipeline.
ukds_job = define_asset_job(
    "ukds",
    selection=["ukds_identifiers", "ukds_datasets", "ukds_abstracts"],
)
cdrc_job = define_asset_job(
    "cdrc",
    selection=[
        "cdrc_session",
        "cdrc_metadata",
        "cdrc_notes",
        "cdrc_resources",
        "cdrc_pdfs",
    ],
)
|
src/datastore/loaders.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Iterator
|
| 3 |
+
|
| 4 |
+
import dateparser
|
| 5 |
+
import polars as pl
|
| 6 |
+
from langchain_community.document_loaders import PDFMinerLoader
|
| 7 |
+
from langchain_core.document_loaders import BaseLoader
|
| 8 |
+
from langchain_core.documents import Document
|
| 9 |
+
|
| 10 |
+
from src.common.utils import Paths
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CDRCLoader(BaseLoader):
    """LangChain loader for CDRC documents: dataset-notes .txt or resource .pdf.

    Metadata is looked up from the parquet tables written by the CDRC assets,
    so those assets must have materialised before this loader runs.
    """

    def __init__(self, file_path: str) -> None:
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        # Dispatch on extension; any other extension yields nothing.
        if self.file_path.endswith(".pdf"):
            document = PDFMinerLoader(self.file_path).load()
            metadata = self._add_cdrc_pdf_metadata(self.file_path)
            document[0].metadata |= metadata
            yield document[0]
        elif self.file_path.endswith(".txt"):
            with open(self.file_path, encoding="utf-8") as f:
                yield Document(
                    page_content=f.read(),
                    metadata={"source": self.file_path}
                    | self._add_cdrc_txt_metadata(self.file_path),
                )

    @staticmethod
    def _add_cdrc_txt_metadata(file_path: str) -> dict[str, str]:
        # Filenames are "<dataset-id>-notes.txt"; rsplit strips the suffix only.
        id = Path(file_path).stem.rsplit("-", maxsplit=1)[0]
        cdrc_meta = pl.read_parquet(Paths.CDRC / "cdrc_metadata.parquet")

        # NOTE(review): assumes the id is always present in the parquet table;
        # an unknown id would raise on the [0] indexing below — confirm.
        metadata = cdrc_meta.filter(pl.col("id") == id)
        iso_date = dateparser.parse(metadata["metadata_created"][0]).isoformat()  # type: ignore
        return {
            "title": metadata["title"][0],
            "id": metadata["id"][0],
            "url": metadata["url"][0],
            "date_created": iso_date,
            "source": "CDRC",
        }

    @staticmethod
    def _add_cdrc_pdf_metadata(file_path: str) -> dict[str, str]:
        # Filenames are "<dataset-id>-<resource-id>.pdf". The split at index 5
        # assumes both ids are UUIDs (5 hyphen-separated groups) — TODO confirm.
        id = Path(file_path).stem
        main_id = "-".join(id.split("-")[:5])
        resource_id = "-".join(id.split("-")[5:])

        cdrc_meta = pl.read_parquet(Paths.CDRC / "cdrc_metadata.parquet")
        cdrc_pdf_meta = pl.read_parquet(Paths.CDRC / "cdrc_resource_metadata.parquet")

        resource = cdrc_pdf_meta.filter(pl.col("resource_id") == resource_id)
        metadata = cdrc_meta.filter(pl.col("id") == main_id)

        iso_date = dateparser.parse(resource["created"][0]).isoformat()  # type: ignore
        return {
            "title": metadata["title"][0],
            "id": metadata["id"][0],
            "url": metadata["url"][0],
            "date_created": iso_date,
            "source": "CDRC",
        }
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class ADRLoader(BaseLoader):
    """LangChain loader for ADR description text files.

    Metadata is looked up from the parquet catalogue written by the ADR assets.
    """

    def __init__(self, file_path: str) -> None:
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per file with catalogue metadata merged in."""
        with open(self.file_path, encoding="utf-8") as f:
            yield Document(
                page_content=f.read(),
                metadata={"source": self.file_path}
                | self._add_adr_metadata(self.file_path),
            )

    @staticmethod
    def _add_adr_metadata(file_path: str) -> dict[str, str]:
        """Look up catalogue metadata for a file named "<id>-<origin_id>-<suffix>".

        Returns placeholder metadata when the dataset is missing from the
        parquet catalogue instead of raising.
        """
        # Assumes neither id contains a hyphen — TODO confirm against the assets
        # that write these filenames.
        doc_id, origin_id, _ = Path(file_path).stem.split("-")
        matched = (
            pl.scan_parquet(Paths.ADR / "adr_datasets.parquet")
            .filter((pl.col("id") == doc_id) & (pl.col("origin_id") == origin_id))
            .collect()
        )
        # Bug fix: the original indexed `.collect()[0]` BEFORE checking for an
        # empty match, which raises on a miss and made this fallback unreachable.
        if matched.height == 0:
            return {
                "title": "",
                "id": f"{doc_id}-{origin_id}",
                "url": "",
                "date_created": "",
                "source": "ADR",
            }
        metadata = matched[0].to_dict(as_series=False)

        date_created = metadata["coverage_1"][0]["distributionReleaseDate"]
        date_created = (
            dateparser.parse(date_created).isoformat()  # type: ignore
            if isinstance(date_created, str)
            else ""
        )
        return {
            "title": metadata["name"][0],
            "id": f"{doc_id}-{origin_id}",
            "url": metadata["url"][0],
            "date_created": date_created,
            "source": "ADR",
        }
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class UKDSLoader(BaseLoader):
    """LangChain loader for UKDS abstract text files.

    Metadata is looked up from the parquet table written by ``ukds_datasets``.
    """

    def __init__(self, file_path: str) -> None:
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        # One Document per file, with catalogue metadata merged into "source".
        with open(self.file_path, encoding="utf-8") as f:
            yield Document(
                page_content=f.read(),
                metadata={"source": self.file_path}
                | self._add_ukds_metadata(self.file_path),
            )

    @staticmethod
    def _add_ukds_metadata(file_path: str) -> dict[str, str]:
        # Filenames are "<study-id>-abstract.txt"; the id matches the query
        # parameter at the end of the catalogue URL stored in the parquet.
        doc_id = Path(file_path).stem.split("-")[0]
        metadata = (
            pl.scan_parquet(Paths.UKDS / "ukds.parquet")
            .with_columns(pl.col("url").str.split("=").list[1].alias("id"))
            .filter(pl.col("id") == doc_id)
            .collect()
            .to_dict(as_series=False)
        )
        # No catalogue entry: return placeholder metadata rather than raising.
        if len(metadata["id"]) == 0:
            return {
                "title": "",
                "id": doc_id,
                "url": "",
                "date_created": "",
                "source": "UKDS",
            }
        date_created = (
            dateparser.parse(metadata["date"][0]).isoformat()  # type: ignore
            if isinstance(metadata["date"][0], str)
            else ""
        )
        return {
            "title": metadata["title"][0],
            "id": doc_id,
            "url": metadata["url"][0],
            "date_created": date_created,
            "source": "UKDS",
        }
|
src/{resources.py → datastore/resources.py}
RENAMED
|
File without changes
|
src/datastore/schedules.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dagster import DefaultScheduleStatus, ScheduleDefinition
|
| 2 |
+
|
| 3 |
+
from src.datastore.jobs import adr_job, cdrc_job, ukds_job
|
| 4 |
+
|
| 5 |
+
# One weekly schedule per harvest job, all firing Sunday 00:00 (cron "0 0 * * 0")
# and enabled by default so a fresh deployment starts harvesting immediately.
adr_schedule = ScheduleDefinition(
    job=adr_job,
    cron_schedule="0 0 * * 0",
    name="adr_weekly_schedule",
    default_status=DefaultScheduleStatus.RUNNING,
)
cdrc_schedule = ScheduleDefinition(
    job=cdrc_job,
    cron_schedule="0 0 * * 0",
    name="cdrc_weekly_schedule",
    default_status=DefaultScheduleStatus.RUNNING,
)
ukds_schedule = ScheduleDefinition(
    job=ukds_job,
    cron_schedule="0 0 * * 0",
    name="ukds_weekly_schedule",
    default_status=DefaultScheduleStatus.RUNNING,
)
|
src/model/__init__.py
ADDED
|
File without changes
|
src/model/answer.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Answer grader: LLM chain scoring whether a generation resolves the question."""

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

from src.common.settings import cfg

# Load API keys (e.g. OPENAI_API_KEY) from .env before constructing the LLM.
_ = load_dotenv()


class GradeAnswer(BaseModel):
    """Binary score to assess answer addresses question."""

    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )


# Deterministic grading: temperature 0. (The previous import of
# `structured_llm_grader` from src.model.grader was dead code — it was
# immediately shadowed by the rebinding below — and has been removed.)
llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
structured_llm_grader = llm.with_structured_output(GradeAnswer)

# Prompt (fixed typo: "'Yes'" was missing its opening quote)
system = """You are a grader assessing whether an answer addresses / resolves a question \n
Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
    ]
)
# Public runnable: prompt -> LLM -> GradeAnswer.
answer_grader = answer_prompt | structured_llm_grader
|
src/model/dag.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NOTE(review): this asset fragment references names (`asset`,
# `AssetExecutionContext`, `OpenAIResource`, `OpenAIEmbeddings`, `cfg`,
# `PineconeVectorStore`) that are never imported in this module, so importing
# the file fails with NameError. It appears to be misplaced/dead code — confirm
# and either add the imports or remove the block.
@asset(compute_kind="OpenAI", deps=["pinecone_index"])
def search(context: AssetExecutionContext, openai: OpenAIResource):
    # Builds a vectorstore handle over the existing Pinecone index; currently
    # produces nothing (the retriever line is commented out).
    with openai.get_client(context) as client:
        embeddings = OpenAIEmbeddings(
            client=client.embeddings, model=cfg.datastore.embed_model
        )
        vectorstore = PineconeVectorStore(
            index_name=cfg.datastore.index_name, embedding=embeddings
        )

        # retriever = vectorstore.as_retriever()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
from typing import Any, Dict
|
| 15 |
+
|
| 16 |
+
import requests
|
| 17 |
+
from fastapi import FastAPI, HTTPException
|
| 18 |
+
from pydantic import BaseModel
|
| 19 |
+
|
| 20 |
+
app = FastAPI()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class RunConfig(BaseModel):
    """Request body wrapper: arbitrary Dagster run configuration forwarded verbatim."""

    run_config: Dict[str, Any]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def launch_dagster_run(pipeline_name: str, run_config: dict, mode: str = "default"):
    """Launch a Dagster pipeline run via the GraphQL API.

    The endpoint is read from the DAGSTER_GRAPHQL_URL environment variable
    (falling back to the original placeholder), so the host no longer has to be
    edited in source. Returns the raw GraphQL response as a dict.
    """
    import os  # local import keeps this module's top-level imports unchanged

    # Generalized: previously a hard-coded "<your-dagster-instance>" placeholder.
    url = os.environ.get(
        "DAGSTER_GRAPHQL_URL", "http://<your-dagster-instance>/graphql"
    )
    headers = {
        "Content-Type": "application/json",
    }
    query = """
    mutation($pipelineName: String!, $runConfigData: RunConfigData, $mode: String!) {
    launchPipelineExecution(
        executionParams: {
        selector: {
            pipelineName: $pipelineName
        },
        runConfigData: $runConfigData,
        mode: $mode
        }
    ) {
        __typename
        ... on LaunchPipelineRunSuccess {
        run {
            runId
        }
        }
        ... on PythonError {
        message
        stack
        }
    }
    }
    """
    variables = {
        "pipelineName": pipeline_name,
        "runConfigData": run_config,
        "mode": mode,
    }
    # Timeout so the API endpoint cannot hang indefinitely on a dead webserver.
    response = requests.post(
        url, json={"query": query, "variables": variables}, headers=headers, timeout=60
    )
    return response.json()
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.post("/trigger-asset")
|
| 68 |
+
def trigger_asset(pipeline_name: str, run_config: RunConfig, mode: str = "default"):
|
| 69 |
+
try:
|
| 70 |
+
result = launch_dagster_run(pipeline_name, run_config.run_config, mode)
|
| 71 |
+
return result
|
| 72 |
+
except Exception as e:
|
| 73 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
import uvicorn
|
| 78 |
+
|
| 79 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
src/model/grader.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Retrieval grader: LLM chain scoring document relevance to a user query."""

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

from src.common.settings import cfg

# Load API keys (e.g. OPENAI_API_KEY) from .env before constructing the LLM.
_ = load_dotenv()


class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the query, 'yes' or 'no'"
    )


# Deterministic grading: temperature 0.
llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)
system = """
You are a grader assessing relevance of a retrieved document to a user query. \n
It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
If the document contains keyword(s) or semantic meaning related to the user query, grade it as relevant. \n
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the query.
"""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User query: {query}"),
    ]
)

# Public runnable: prompt -> LLM -> GradeDocuments; imported by sibling modules.
retrieval_grader = grade_prompt | structured_llm_grader
|
src/model/hallucination.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hallucination grader: LLM chain checking a generation is grounded in facts."""

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

from src.common.settings import cfg

# Load API keys (e.g. OPENAI_API_KEY) from .env before constructing the LLM.
_ = load_dotenv()


class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# Deterministic grading: temperature 0. (The previous import of
# `structured_llm_grader` from src.model.grader was dead code — it was
# immediately shadowed by the rebinding below — and has been removed.)
llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {document} \n\n LLM generation: {generation}"),
    ]
)

# Public runnable: prompt -> LLM -> GradeHallucinations.
hallucination_grader = hallucination_prompt | structured_llm_grader
|
src/model/model.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TypedDict
|
| 2 |
+
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from langchain_core.documents import Document
|
| 5 |
+
from langchain_openai import OpenAIEmbeddings
|
| 6 |
+
from langchain_pinecone import PineconeVectorStore
|
| 7 |
+
from langgraph.graph import END, START, StateGraph
|
| 8 |
+
|
| 9 |
+
from src.common.settings import cfg
|
| 10 |
+
from src.model.grader import retrieval_grader
|
| 11 |
+
from src.model.hallucination import hallucination_grader
|
| 12 |
+
from src.model.rag import rag_chain
|
| 13 |
+
|
| 14 |
+
_ = load_dotenv()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class SearchState(TypedDict):
    """LangGraph state for the retrieval workflow."""

    query: str
    documents: list[str]


class GenerationState(TypedDict):
    """LangGraph state for answer generation and hallucination grading."""

    query: str
    document: str
    generation: str
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _group_by_document(documents):
    """Merge retrieved chunks that share a dataset id into one Document each.

    ``documents`` is a list of (Document, score) pairs from a similarity
    search. Chunks from the same dataset are concatenated with a separator;
    the merged Document keeps the first chunk's metadata plus the best score.
    """
    grouped: dict[str, list[tuple[Document, float]]] = {}
    for node, score in documents:
        # Renamed from `id` to avoid shadowing the builtin; setdefault replaces
        # the manual key-initialisation dance.
        doc_id = node.metadata["id"]
        grouped.setdefault(doc_id, []).append((node, score))

    out_nodes = []
    for group in grouped.values():
        nodes = [pair[0] for pair in group]
        scores = [pair[1] for pair in group]
        content = "\n--------------------\n".join([n.page_content for n in nodes])
        document = Document(
            page_content=content, metadata=nodes[0].metadata | {"score": max(scores)}
        )
        out_nodes.append(document)
    return out_nodes
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def retrieve(state):
    """Graph node: similarity-search Pinecone and group chunks by dataset id."""
    print("---RETRIEVE---")
    embeddings = OpenAIEmbeddings(model=cfg.datastore.embed_model)
    vectorstore = PineconeVectorStore(
        index_name=cfg.datastore.index_name,
        embedding=embeddings,
    )
    query = state["query"]
    # Fetch top_k chunks, then merge chunks of the same dataset into one
    # Document via _group_by_document.
    documents = vectorstore.similarity_search_with_score(query=query, k=cfg.model.top_k)
    documents = _group_by_document(documents)
    return {"documents": documents, "query": query}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def grade_documents(state):
    """LangGraph node: keep only documents the LLM grader marks relevant.

    Reads ``query`` and ``documents`` from state; invokes the retrieval
    grader per document and returns state with ``documents`` replaced by
    the subset graded "yes".
    """
    print("---CHECK DOCUMENT RELEVANCE TO QUERY---")
    query = state["query"]

    filtered_docs = []
    for d in state["documents"]:
        score = retrieval_grader.invoke({"query": query, "document": d.page_content})
        grade = score.binary_score  # type: ignore
        if grade == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            # FIX: dropped the redundant trailing `continue`; irrelevant
            # documents are simply not carried forward.
            print("---GRADE: DOCUMENT NOT RELEVANT---")
    return {"documents": filtered_docs, "query": query}
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def generation(state):
    """LangGraph node: generate an explanation of why the document matches the query."""
    user_query, doc = state["query"], state["document"]
    answer = rag_chain.invoke({"query": user_query, "context": doc})
    return {"query": user_query, "document": doc, "generation": answer}
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def grade_generation(state):
    """LangGraph node: verify the generation is grounded in its document.

    If the hallucination grader answers "yes" (grounded), the generation is
    passed through unchanged; otherwise it is replaced with the sentinel
    string "Hallucination".
    """
    query, document = state["query"], state["document"]
    generation = state["generation"]

    verdict = hallucination_grader.invoke(
        {"document": document, "generation": generation}
    )
    final = generation if verdict.binary_score == "yes" else "Hallucination"
    return {"query": query, "document": document, "generation": final}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def search_graph():
    """Build and compile the retrieval graph: START -> retrieve -> END.

    Document relevance grading (grade_documents) is currently disabled and
    not wired into the graph.
    """
    graph = StateGraph(SearchState)
    graph.add_node("retrieve", retrieve)
    graph.add_edge(START, "retrieve")
    graph.add_edge("retrieve", END)
    return graph.compile()
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def generation_graph():
    """Build and compile the generation graph: START -> gen -> grade_generation -> END."""
    graph = StateGraph(GenerationState)
    for node_name, node_fn in (("gen", generation), ("grade_generation", grade_generation)):
        graph.add_node(node_name, node_fn)
    graph.add_edge(START, "gen")
    graph.add_edge("gen", "grade_generation")
    graph.add_edge("grade_generation", END)
    return graph.compile()
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# FIX: this smoke test previously ran at import time, which made every
# `import src.model.model` (e.g. from the API) hit the vector store and spend
# API calls; the final list expression was also computed and discarded.
# Guard it behind __main__ and print the result instead.
if __name__ == "__main__":
    search_app = search_graph()
    thread_id = 42
    q = "What is the capital of France?"

    out = search_app.invoke(
        {"query": q}, config={"configurable": {"thread_id": thread_id}}
    )
    print([d.dict() for d in out["documents"]])
|
src/model/rag.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from src.common.settings import cfg

# Load API keys (e.g. OPENAI_API_KEY) from .env; the bool result is discarded.
_ = load_dotenv()
# Prompt asking the LLM to explain why a retrieved dataset is relevant to the
# user's query. {query} and {context} are filled in when the chain is invoked.
human = """
A user has queried a data catalogue, which has returned a relevant dataset.

Explain the relevance of this dataset to the query in under three sentences. Use your own knowledge or the data profile. Do not say it is unrelated; attempt to find a relevant connection.

Query: "{query}"

Dataset description:

{context}
"""

gen_prompt = ChatPromptTemplate.from_messages([("human", human)])
# temperature=0 for deterministic output; model name comes from config.
llm = ChatOpenAI(model=cfg.model.llm, temperature=0)
# Public chain of this module: prompt -> chat model -> plain string output.
rag_chain = gen_prompt | llm | StrOutputParser()
|
src/search_api/__init__.py
ADDED
|
File without changes
|
src/search_api/api.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from uuid import UUID, uuid4

from fastapi import FastAPI
from langchain_core.documents import Document

from src.model.model import generation_graph, search_graph

app = FastAPI()
# In-memory caches keyed by thread_id: serialized search results and the
# originating query, so /explain can refer back to a prior /query call.
# NOTE(review): per-process only and never evicted — fine for a demo,
# not for multi-worker or long-running deployments.
document_store = {}
query_mapping = {}


# Compile the LangGraph pipelines once at startup rather than per request.
search_app = search_graph()
gen_app = generation_graph()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@app.get("/")
def index():
    """Root endpoint: health check that points callers at the search endpoint."""
    return dict(message="Make a post request to /query.")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@app.post("/query")
async def query(q: str) -> dict:
    """Run the search graph for query *q* and cache the results.

    Returns a fresh thread id, the query, and the retrieved documents
    serialized to dicts. Results are stashed in the module-level stores so
    /explain/{thread_id} can reference them later.
    """
    thread_id = uuid4()
    out = search_app.invoke(
        {"query": q}, config={"configurable": {"thread_id": thread_id}}
    )
    # NOTE(review): Document.dict() is the pydantic-v1 style API; under
    # pydantic v2 this is deprecated (model_dump is the successor) — confirm
    # against the installed langchain/pydantic versions.
    docs_dict = [d.dict() for d in out["documents"]]
    document_store[thread_id] = docs_dict
    query_mapping[thread_id] = q
    return {"thread_id": thread_id, "query": q, "documents": docs_dict}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@app.get("/explain/{thread_id}")
async def explain(thread_id: UUID, docid: int) -> dict:
    """Explain why document *docid* from a previous /query call matched its query.

    Rebuilds the cached document, runs the generation graph over it, and
    returns the generated explanation plus request metadata.
    """
    # NOTE(review): an unknown thread_id raises KeyError and an out-of-range
    # docid raises IndexError, both surfacing as HTTP 500 — consider mapping
    # them to a 404 HTTPException.
    doc_dict = document_store[thread_id][docid]
    document = Document(
        page_content=doc_dict["page_content"],
        metadata=doc_dict["metadata"],
    )
    query = query_mapping[thread_id]
    generation_state = gen_app.invoke(
        {"query": query, "document": document},
        config={"configurable": {"thread_id": thread_id}},
    )
    generation = generation_state["generation"]

    return {
        "generation": generation,
        "metadata": {
            "thread_id": thread_id,
            "query": query,
            "related_dataset": doc_dict,
        },
    }
|
src/search_api/streamlit_app.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from subprocess import Popen
|
| 2 |
+
from time import sleep
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
import streamlit as st
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def main():
    """Streamlit front end for the CDRC semantic search API.

    Ensures the FastAPI backend is reachable (spawning it via uvicorn if
    needed), submits the user's query, and renders the returned dataset
    metadata.
    """
    st.title("CDRC Semantic Search App")

    with st.spinner("Loading..."):
        server = None
        while True:
            try:
                r = requests.get("http://localhost:8000/")
                if r.status_code == 200:
                    break
            except requests.exceptions.ConnectionError:
                # FIX: the module path must match src/search_api/api.py
                # (was "search_service.api:app"), and the server should be
                # spawned at most once rather than on every failed poll.
                if server is None:
                    server = Popen(
                        ["uvicorn", "src.search_api.api:app", "--port", "8000"]
                    )
                sleep(10)

    # use_llm = st.toggle("Activate LLM")
    use_llm = False
    text = st.text_input("Query")
    if text == "":
        return None

    # FIX: /query is declared with @app.post in the API, so a GET would get
    # a 405 Method Not Allowed; use POST with the query as a parameter.
    r = requests.post(
        "http://localhost:8000/query", params={"q": text, "use_llm": use_llm}
    )
    if r.status_code != 200:
        st.error("No results :(")
        return None

    if use_llm:
        response, metadata = r.json()
        responses = []
        # FIX: loop variable renamed (was `r`, shadowing the HTTP response).
        for chunk in response["response"].split("---------------------"):
            if all(x in chunk for x in ["Summary: ", "Relevance: "]):
                responses.append(chunk)
            else:
                responses.append(None)

        for res, meta in zip(responses, metadata.values(), strict=False):
            st.subheader(meta["title"])
            if res:
                summary, relevance = res.split("Summary: ")[1].split("Relevance: ")
                st.caption(summary)
                st.caption(f":red[{relevance}]")
                # st.caption(f"Score: :red[{meta['score']:.3f}]")
            else:
                st.caption("LLM did not return a response.")
            if meta["url"] != "None":
                st.write(meta["url"])
            st.divider()
    else:
        # NOTE(review): the refactored API returns
        # {"thread_id", "query", "documents"}, while this branch still assumes
        # a flat list of metadata dicts with title/score/url — confirm the
        # expected response schema before relying on this rendering path.
        metadata = r.json()
        for meta in metadata:
            st.subheader(meta["title"])
            st.caption(f"Score: :red[{meta['score']:.3f}]")
            if meta["url"] != "None":
                st.write(meta["url"])
            st.divider()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Script entry point: launch the Streamlit UI.
if __name__ == "__main__":
    main()
|
src/sensors.py
DELETED
|
@@ -1,2 +0,0 @@
|
|
| 1 |
-
# see https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors
|
| 2 |
-
# cursor to check which files are new
|
|
|
|
|
|
|
|
|