Upload 107 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +118 -0
- pyproject.toml +202 -0
- src/rag_pipelines/__init__.py +0 -0
- src/rag_pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__init__.py +6 -0
- src/rag_pipelines/embeddings/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/dense.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/sparse_fastembed_qdrant.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/sparse_milvus.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/sparse_pinecone_text.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/dense.py +85 -0
- src/rag_pipelines/embeddings/sparse_fastembed_qdrant.py +57 -0
- src/rag_pipelines/embeddings/sparse_milvus.py +67 -0
- src/rag_pipelines/embeddings/sparse_pinecone_text.py +58 -0
- src/rag_pipelines/evaluation/__init__.py +19 -0
- src/rag_pipelines/evaluation/evaluator.py +54 -0
- src/rag_pipelines/evaluation/response/__init__.py +0 -0
- src/rag_pipelines/evaluation/response/answer_relevancy.py +152 -0
- src/rag_pipelines/evaluation/response/faithfulness.py +132 -0
- src/rag_pipelines/evaluation/response/hallucination.py +127 -0
- src/rag_pipelines/evaluation/response/phoenix_hallucination.py +107 -0
- src/rag_pipelines/evaluation/response/summarization.py +158 -0
- src/rag_pipelines/evaluation/retrieval/__init__.py +0 -0
- src/rag_pipelines/evaluation/retrieval/contextual_precision.py +160 -0
- src/rag_pipelines/evaluation/retrieval/contextual_recall.py +127 -0
- src/rag_pipelines/evaluation/retrieval/contextual_relevancy.py +125 -0
- src/rag_pipelines/llms/__init__.py +3 -0
- src/rag_pipelines/llms/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/llms/__pycache__/groq.cpython-310.pyc +0 -0
- src/rag_pipelines/llms/groq.py +99 -0
- src/rag_pipelines/pipelines/__init__.py +3 -0
- src/rag_pipelines/pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/pipelines/__pycache__/self_rag.cpython-310.pyc +0 -0
- src/rag_pipelines/pipelines/__pycache__/self_rag_graph_state.cpython-310.pyc +0 -0
- src/rag_pipelines/pipelines/adaptive_rag.py +0 -0
- src/rag_pipelines/pipelines/adaptive_rag_graph_state.py +18 -0
- src/rag_pipelines/pipelines/crag.py +172 -0
- src/rag_pipelines/pipelines/crag_graph_state.py +17 -0
- src/rag_pipelines/pipelines/dspy/dspy_baseline_rag.py +46 -0
- src/rag_pipelines/pipelines/dspy/dspy_bayesian_signature_optimization_rag.py +124 -0
- src/rag_pipelines/pipelines/dspy/dspy_bootstrap_few_shot_optimization_rag.py +60 -0
- src/rag_pipelines/pipelines/dspy/dspy_copro_instruction_signature_optimization_rag.py +150 -0
- src/rag_pipelines/pipelines/dspy_baseline_rag.py +81 -0
- src/rag_pipelines/pipelines/dspy_bayesian_signature_optimization_rag.py +119 -0
- src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_optimization_rag.py +91 -0
- src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_random_search_optimization_rag.py +103 -0
- src/rag_pipelines/pipelines/dspy_copro_instruction_signature_optimization_rag.py +121 -0
- src/rag_pipelines/pipelines/dspy_rag.py +47 -0
- src/rag_pipelines/pipelines/dspy_rag_module.py +39 -0
- src/rag_pipelines/pipelines/rag.py +146 -0
.gitignore
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Local run files
|
| 2 |
+
qa.db
|
| 3 |
+
**/qa.db
|
| 4 |
+
**/*qa*.db
|
| 5 |
+
**/test-reports
|
| 6 |
+
|
| 7 |
+
# Byte-compiled / optimized / DLL files
|
| 8 |
+
__pycache__/
|
| 9 |
+
/pycache/*
|
| 10 |
+
**/pycache/*
|
| 11 |
+
*/*/pycache/*
|
| 12 |
+
*/*/*/pycache/*
|
| 13 |
+
*/*/*/*/pycache/*
|
| 14 |
+
*.py[cod]
|
| 15 |
+
*$py.class
|
| 16 |
+
|
| 17 |
+
# C extensions
|
| 18 |
+
*.so
|
| 19 |
+
|
| 20 |
+
# Distribution / packaging
|
| 21 |
+
.Python
|
| 22 |
+
build/
|
| 23 |
+
develop-eggs/
|
| 24 |
+
dist/
|
| 25 |
+
downloads/
|
| 26 |
+
eggs/
|
| 27 |
+
.eggs/
|
| 28 |
+
lib/
|
| 29 |
+
lib64/
|
| 30 |
+
parts/
|
| 31 |
+
sdist/
|
| 32 |
+
var/
|
| 33 |
+
wheels/
|
| 34 |
+
pip-wheel-metadata/
|
| 35 |
+
share/python-wheels/
|
| 36 |
+
*.egg-info/
|
| 37 |
+
.installed.cfg
|
| 38 |
+
*.egg
|
| 39 |
+
MANIFEST
|
| 40 |
+
|
| 41 |
+
# Unit test / coverage reports
|
| 42 |
+
htmlcov/
|
| 43 |
+
.tox/
|
| 44 |
+
.nox/
|
| 45 |
+
.coverage
|
| 46 |
+
.coverage.*
|
| 47 |
+
.cache
|
| 48 |
+
nosetests.xml
|
| 49 |
+
coverage.xml
|
| 50 |
+
*.cover
|
| 51 |
+
*.py,cover
|
| 52 |
+
.hypothesis/
|
| 53 |
+
.pytest_cache/
|
| 54 |
+
|
| 55 |
+
# Translations
|
| 56 |
+
*.mo
|
| 57 |
+
*.pot
|
| 58 |
+
|
| 59 |
+
# Django stuff:
|
| 60 |
+
*.log
|
| 61 |
+
local_settings.py
|
| 62 |
+
db.sqlite3
|
| 63 |
+
db.sqlite3-journal
|
| 64 |
+
|
| 65 |
+
# Flask stuff:
|
| 66 |
+
instance/
|
| 67 |
+
.webassets-cache
|
| 68 |
+
|
| 69 |
+
# Scrapy stuff:
|
| 70 |
+
.scrapy
|
| 71 |
+
|
| 72 |
+
# PyBuilder
|
| 73 |
+
target/
|
| 74 |
+
|
| 75 |
+
# Jupyter Notebook
|
| 76 |
+
.ipynb_checkpoints
|
| 77 |
+
|
| 78 |
+
# IPython
|
| 79 |
+
profile_default/
|
| 80 |
+
ipython_config.py
|
| 81 |
+
|
| 82 |
+
# pyenv
|
| 83 |
+
.python-version
|
| 84 |
+
|
| 85 |
+
# pyflow
|
| 86 |
+
__pypackages__/
|
| 87 |
+
|
| 88 |
+
# Environments
|
| 89 |
+
.env
|
| 90 |
+
.venv
|
| 91 |
+
env/
|
| 92 |
+
venv/
|
| 93 |
+
ENV/
|
| 94 |
+
env.bak/
|
| 95 |
+
venv.bak/
|
| 96 |
+
|
| 97 |
+
# mkdocs documentation
|
| 98 |
+
/site
|
| 99 |
+
|
| 100 |
+
# mypy
|
| 101 |
+
.mypy_cache/
|
| 102 |
+
.dmypy.json
|
| 103 |
+
dmypy.json
|
| 104 |
+
|
| 105 |
+
# Pyre type checker
|
| 106 |
+
.pyre/
|
| 107 |
+
|
| 108 |
+
# PyCharm
|
| 109 |
+
.idea
|
| 110 |
+
|
| 111 |
+
# VSCode
|
| 112 |
+
.vscode
|
| 113 |
+
|
| 114 |
+
# http cache (requests-cache)
|
| 115 |
+
**/http_cache.sqlite
|
| 116 |
+
|
| 117 |
+
# ruff
|
| 118 |
+
.ruff_cache
|
pyproject.toml
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling", "hatch-vcs"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "rag-pipelines"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = 'Advanced Retrieval Augmented Generation Pipelines'
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.9"
|
| 11 |
+
license = "MIT"
|
| 12 |
+
keywords = []
|
| 13 |
+
authors = [
|
| 14 |
+
{ name = "Ashwin Mathur", email = "" },
|
| 15 |
+
{ name = "Varun Mathur", email = "" },
|
| 16 |
+
]
|
| 17 |
+
classifiers = [
|
| 18 |
+
"License :: OSI Approved :: MIT License",
|
| 19 |
+
"Development Status :: 4 - Beta",
|
| 20 |
+
"Programming Language :: Python",
|
| 21 |
+
"Programming Language :: Python :: 3.9",
|
| 22 |
+
"Programming Language :: Python :: 3.10",
|
| 23 |
+
"Programming Language :: Python :: 3.11",
|
| 24 |
+
"Programming Language :: Python :: Implementation :: CPython",
|
| 25 |
+
"Programming Language :: Python :: Implementation :: PyPy",
|
| 26 |
+
]
|
| 27 |
+
dependencies = [
|
| 28 |
+
"dataloaders @ git+https://github.com/avnlp/dataloaders.git",
|
| 29 |
+
"langchain-core",
|
| 30 |
+
"langgraph",
|
| 31 |
+
"langchain-text-splitters",
|
| 32 |
+
"langchain-experimental",
|
| 33 |
+
"langchain-huggingface",
|
| 34 |
+
"langchain-groq",
|
| 35 |
+
"langchain_milvus",
|
| 36 |
+
"langchain-qdrant",
|
| 37 |
+
"langchain-pinecone",
|
| 38 |
+
"langchain-voyageai",
|
| 39 |
+
"spladerunner",
|
| 40 |
+
"haystack-ai",
|
| 41 |
+
"weave",
|
| 42 |
+
"edgartools",
|
| 43 |
+
"fastembed",
|
| 44 |
+
"pinecone-text[splade]",
|
| 45 |
+
"unstructured[pdf]",
|
| 46 |
+
"deepeval",
|
| 47 |
+
"arize-phoenix",
|
| 48 |
+
"dspy",
|
| 49 |
+
"dspy-ai[milvus]",
|
| 50 |
+
"optimum[onnxruntime]",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
[project.optional-dependencies]
|
| 54 |
+
dev = ["pytest"]
|
| 55 |
+
|
| 56 |
+
[project.urls]
|
| 57 |
+
Documentation = "https://github.com/avnlp/rag-pipelines#readme"
|
| 58 |
+
Issues = "https://github.com/avnlp/rag-pipelines/issues"
|
| 59 |
+
Source = "https://github.com/avnlp/rag-pipelines"
|
| 60 |
+
|
| 61 |
+
[tool.hatch.metadata]
|
| 62 |
+
allow-direct-references = true
|
| 63 |
+
|
| 64 |
+
[tool.hatch.build.targets.wheel]
|
| 65 |
+
packages = ["src/rag_pipelines"]
|
| 66 |
+
|
| 67 |
+
[tool.hatch.envs.default]
|
| 68 |
+
installer = "uv"
|
| 69 |
+
dependencies = [
|
| 70 |
+
"coverage[toml]>=6.5",
|
| 71 |
+
"pytest",
|
| 72 |
+
"pytest-rerunfailures",
|
| 73 |
+
"pytest-mock",
|
| 74 |
+
]
|
| 75 |
+
|
| 76 |
+
[tool.hatch.envs.default.scripts]
|
| 77 |
+
test = "pytest -vv {args:tests}"
|
| 78 |
+
test-cov = "coverage run -m pytest {args:tests}"
|
| 79 |
+
test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x"
|
| 80 |
+
cov-report = ["- coverage combine", "coverage report"]
|
| 81 |
+
cov = ["test-cov", "cov-report"]
|
| 82 |
+
cov-retry = ["test-cov-retry", "cov-report"]
|
| 83 |
+
|
| 84 |
+
[[tool.hatch.envs.test.matrix]]
|
| 85 |
+
python = ["39", "310", "311"]
|
| 86 |
+
|
| 87 |
+
[tool.hatch.envs.lint]
|
| 88 |
+
installer = "uv"
|
| 89 |
+
detached = true
|
| 90 |
+
dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
|
| 91 |
+
|
| 92 |
+
[tool.hatch.envs.lint.scripts]
|
| 93 |
+
typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
|
| 94 |
+
style = ["ruff check {args:}", "black --check --diff {args:.}"]
|
| 95 |
+
fmt = ["black {args:.}", "ruff check --fix --unsafe-fixes {args:}", "style"]
|
| 96 |
+
all = ["style", "typing"]
|
| 97 |
+
|
| 98 |
+
[tool.coverage.run]
|
| 99 |
+
source = ["rag_pipelines"]
|
| 100 |
+
branch = true
|
| 101 |
+
parallel = true
|
| 102 |
+
|
| 103 |
+
[tool.coverage.report]
|
| 104 |
+
omit = ["*/tests/*", "*/__init__.py"]
|
| 105 |
+
show_missing = true
|
| 106 |
+
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
|
| 107 |
+
|
| 108 |
+
[tool.ruff]
|
| 109 |
+
target-version = "py39"
|
| 110 |
+
line-length = 120
|
| 111 |
+
|
| 112 |
+
[tool.ruff.lint]
|
| 113 |
+
select = [
|
| 114 |
+
"A",
|
| 115 |
+
"ARG",
|
| 116 |
+
"B",
|
| 117 |
+
"C",
|
| 118 |
+
"D",
|
| 119 |
+
"D401",
|
| 120 |
+
"DTZ",
|
| 121 |
+
"E",
|
| 122 |
+
"EM",
|
| 123 |
+
"F",
|
| 124 |
+
"I",
|
| 125 |
+
"ICN",
|
| 126 |
+
"ISC",
|
| 127 |
+
"N",
|
| 128 |
+
"PLC",
|
| 129 |
+
"PLE",
|
| 130 |
+
"PLR",
|
| 131 |
+
"PLW",
|
| 132 |
+
"Q",
|
| 133 |
+
"RUF",
|
| 134 |
+
"S",
|
| 135 |
+
"T",
|
| 136 |
+
"TID",
|
| 137 |
+
"UP",
|
| 138 |
+
"W",
|
| 139 |
+
"YTT",
|
| 140 |
+
]
|
| 141 |
+
ignore = [
|
| 142 |
+
# Allow non-abstract empty methods in abstract base classes
|
| 143 |
+
"B027",
|
| 144 |
+
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
| 145 |
+
"FBT003",
|
| 146 |
+
# Ignore use of exec (S102) and checks for possible hardcoded passwords
|
| 147 |
+
"S102",
|
| 148 |
+
"S105",
|
| 149 |
+
"S106",
|
| 150 |
+
"S107",
|
| 151 |
+
# Ignore complexity
|
| 152 |
+
"C901",
|
| 153 |
+
"PLR0911",
|
| 154 |
+
"PLR0912",
|
| 155 |
+
"PLR0913",
|
| 156 |
+
"PLR0915",
|
| 157 |
+
# Allow print statements
|
| 158 |
+
"T201",
|
| 159 |
+
# Ignore missing module docstrings
|
| 160 |
+
"D100",
|
| 161 |
+
"D104",
|
| 162 |
+
# Ignore Line too long
|
| 163 |
+
"E501",
|
| 164 |
+
# Ignore builtin argument shadowing
|
| 165 |
+
"A002",
|
| 166 |
+
# Ignore builtin module shadowing
|
| 167 |
+
"A005",
|
| 168 |
+
# Ignore Function calls in argument defaults
|
| 169 |
+
"B008",
|
| 170 |
+
"ARG002",
|
| 171 |
+
"ARG005",
|
| 172 |
+
]
|
| 173 |
+
unfixable = [
|
| 174 |
+
# Don't touch unused imports
|
| 175 |
+
"F401",
|
| 176 |
+
]
|
| 177 |
+
|
| 178 |
+
[tool.ruff.lint.pydocstyle]
|
| 179 |
+
convention = "google"
|
| 180 |
+
|
| 181 |
+
[tool.ruff.lint.isort]
|
| 182 |
+
known-first-party = ["rag_pipelines"]
|
| 183 |
+
|
| 184 |
+
[tool.ruff.lint.flake8-tidy-imports]
|
| 185 |
+
ban-relative-imports = "parents"
|
| 186 |
+
|
| 187 |
+
[tool.ruff.lint.per-file-ignores]
|
| 188 |
+
# Tests can use magic values, assertions, and relative imports
|
| 189 |
+
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
| 190 |
+
|
| 191 |
+
[tool.pytest.ini_options]
|
| 192 |
+
minversion = "6.0"
|
| 193 |
+
addopts = "--strict-markers"
|
| 194 |
+
markers = ["integration: integration tests"]
|
| 195 |
+
log_cli = true
|
| 196 |
+
|
| 197 |
+
[tool.black]
|
| 198 |
+
line-length = 120
|
| 199 |
+
|
| 200 |
+
[[tool.mypy.overrides]]
|
| 201 |
+
module = ["rag_pipelines.*", "pytest.*", "numpy.*"]
|
| 202 |
+
ignore_missing_imports = true
|
src/rag_pipelines/__init__.py
ADDED
|
File without changes
|
src/rag_pipelines/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
src/rag_pipelines/embeddings/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.embeddings.dense import DenseEmbeddings
|
| 2 |
+
from rag_pipelines.embeddings.sparse_fastembed_qdrant import SparseEmbeddings
|
| 3 |
+
from rag_pipelines.embeddings.sparse_milvus import SparseEmbeddingsMilvus
|
| 4 |
+
from rag_pipelines.embeddings.sparse_pinecone_text import SparseEmbeddingsSplade
|
| 5 |
+
|
| 6 |
+
__all__ = ["DenseEmbeddings", "SparseEmbeddings", "SparseEmbeddingsMilvus", "SparseEmbeddingsSplade"]
|
src/rag_pipelines/embeddings/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (552 Bytes). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/dense.cpython-310.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/sparse_fastembed_qdrant.cpython-310.pyc
ADDED
|
Binary file (2.73 kB). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/sparse_milvus.cpython-310.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/sparse_pinecone_text.cpython-310.pyc
ADDED
|
Binary file (2.61 kB). View file
|
|
|
src/rag_pipelines/embeddings/dense.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DenseEmbeddings(weave.Model):
    """Generate dense embeddings for documents and queries using a SentenceTransformer model.

    This class wraps LangChain's `HuggingFaceEmbeddings` to compute dense
    embedding vectors for input text.

    Attributes:
        model_name (str): The name of the pre-trained embedding model to use.
        model_kwargs (Optional[dict[str, Any]]): Keyword arguments passed to the model constructor.
        encode_kwargs (Optional[dict[str, Any]]): Keyword arguments passed to the encode call.
        show_progress (bool): Whether to display a progress bar while embedding.
        embedding_model (Optional[HuggingFaceEmbeddings]): The initialized embeddings backend.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]]
    encode_kwargs: Optional[dict[str, Any]]
    show_progress: bool
    embedding_model: Optional[HuggingFaceEmbeddings] = None

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs: Optional[dict[str, Any]] = None,
        encode_kwargs: Optional[dict[str, Any]] = None,
        show_progress: bool = True,
    ):
        """Initialize the DenseEmbeddings class with the specified model and configurations.

        Args:
            model_name (str): The name of the pre-trained embedding model.
                Defaults to "sentence-transformers/all-MiniLM-L6-v2".
            model_kwargs (Optional[dict[str, Any]]): Model constructor parameters.
                Defaults to ``{"device": "cpu"}`` when None.
            encode_kwargs (Optional[dict[str, Any]]): Encoding parameters.
                Defaults to ``{"normalize_embeddings": True}`` when None.
            show_progress (bool): Whether to display progress during embedding. Defaults to True.
        """
        # Normalize the optional kwargs once, before the pydantic base stores the fields.
        if encode_kwargs is None:
            encode_kwargs = {"normalize_embeddings": True}
        if model_kwargs is None:
            model_kwargs = {"device": "cpu"}
        super().__init__(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
            show_progress=show_progress,
        )

        # NOTE: super().__init__ already stored the fields above; the previous
        # revision re-assigned them here with unreachable `... if ... is not None
        # else {}` fallbacks (both kwargs are guaranteed non-None at this point).
        # That dead code has been removed.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.model_name,
            model_kwargs=self.model_kwargs,
            encode_kwargs=self.encode_kwargs,
            show_progress=show_progress,
        )

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts and return their embeddings.

        Args:
            texts (list[str]): A list of texts to embed.

        Returns:
            list[list[float]]: One embedding vector per input text.
        """
        return self.embedding_model.embed_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query text and return its embedding.

        Args:
            text (str): The query text to be embedded.

        Returns:
            list[float]: The embedding vector for the query text.
        """
        return self.embedding_model.embed_query(text)
|
src/rag_pipelines/embeddings/sparse_fastembed_qdrant.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from langchain_qdrant.fastembed_sparse import FastEmbedSparse
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SparseEmbeddings(weave.Model):
    """Generate sparse embeddings for documents and queries using the FastEmbedSparse model.

    Attributes:
        model_name (str): The name of the sparse embedding model to use.
        model_kwargs (Optional[dict[str, Any]]): Additional configuration parameters for the model.
        sparse_embedding_model (Optional[FastEmbedSparse]): The initialized FastEmbedSparse model.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]]
    sparse_embedding_model: Optional[FastEmbedSparse] = None

    def __init__(
        self,
        model_name: str = "prithvida/Splade_PP_en_v1",
        model_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize the SparseEmbeddings class with the specified model and configurations.

        Args:
            model_name (str): The name of the sparse embedding model.
                Defaults to "prithvida/Splade_PP_en_v1".
            model_kwargs (Optional[dict[str, Any]]): Additional model configuration
                parameters for initialization. Defaults to None (treated as {}).
        """
        if model_kwargs is None:
            model_kwargs = {}
        # weave.Model is a pydantic model: fields must be declared on the class
        # and initialized through super().__init__ before any attribute
        # assignment is allowed. The previous revision skipped both steps
        # (unlike DenseEmbeddings in this package) and failed at construction.
        super().__init__(model_name=model_name, model_kwargs=model_kwargs)

        # Initialize the sparse embedding model with the specified parameters.
        self.sparse_embedding_model = FastEmbedSparse(model_name=self.model_name, **self.model_kwargs)

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[dict[str, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[str, float]]: A list of sparse embedding dictionaries, one per
                document text, mapping terms to their corresponding weights.
        """
        return self.sparse_embedding_model.embed_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> dict[str, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[str, float]: A sparse embedding dictionary for the query text,
                where keys are terms and values are term weights.
        """
        return self.sparse_embedding_model.embed_query(text)
|
src/rag_pipelines/embeddings/sparse_milvus.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from langchain_milvus.utils.sparse import BaseSparseEmbedding
|
| 5 |
+
from spladerunner import Expander
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SparseEmbeddingsMilvus(BaseSparseEmbedding):
    """Generate sparse embeddings for documents and queries using a SPLADE `Expander` model.

    Implements the `BaseSparseEmbedding` interface used by Milvus hybrid search,
    backed by the spladerunner `Expander` term-expansion model.

    Attributes:
        model_name (str): The name of the SPLADE expansion model to use.
        max_length (int): Maximum sequence length passed to the expander.
        sparse_embedding_model (Optional[Any]): The initialized spladerunner `Expander`.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]] = None
    max_length: int = 512
    sparse_embedding_model: Optional[Any] = None

    def __init__(
        self,
        model_name: str = "Splade_PP_en_v1",
        max_length: int = 512,
    ):
        """Initialize the SparseEmbeddingsMilvus class with the specified model and configurations.

        Args:
            model_name (str): The name of the sparse embedding model. Defaults to "Splade_PP_en_v1".
            max_length (int): Maximum sequence length for the expander. Defaults to 512.
        """
        self.model_name = model_name
        self.max_length = max_length

        # Initialize the sparse embedding model with specified parameters
        self.sparse_embedding_model = Expander(model_name=self.model_name, max_length=self.max_length)

    def _sparse_to_dict(self, sparse_vector: Any) -> dict[int, float]:
        """Convert an ``{"indices": [...], "values": [...]}`` sparse vector into a dict."""
        return dict(zip(sparse_vector["indices"], sparse_vector["values"]))

    @weave.op()
    def embed_query(self, text: str) -> dict[int, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[int, float]: A sparse embedding dictionary for the query text,
                mapping vocabulary indices to term weights.
        """
        sparse_embeddings = list(self.sparse_embedding_model.expand([text]))
        return self._sparse_to_dict(sparse_embeddings[0])

    @weave.op()
    def embed_documents(self, texts: list[str]) -> list[dict[int, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[int, float]]: A list of sparse embedding dictionaries, one per
                document text, mapping vocabulary indices to term weights.
        """
        sparse_embeddings = list(self.sparse_embedding_model.expand(texts))
        return [self._sparse_to_dict(sparse_embeddings[i]) for i in range(len(texts))]
|
src/rag_pipelines/embeddings/sparse_pinecone_text.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from pinecone_text.sparse import SpladeEncoder
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SparseEmbeddingsSplade(weave.Model):
    """Generate sparse embeddings for documents and queries using the SpladeEncoder model.

    Attributes:
        model_kwargs (Optional[dict[str, Any]]): Additional configuration parameters for the model.
        sparse_embedding_model (Optional[SpladeEncoder]): The SpladeEncoder initialized
            with the specified parameters.
    """

    model_kwargs: Optional[dict[str, Any]]
    sparse_embedding_model: Optional[SpladeEncoder] = None

    def __init__(
        self,
        model_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize the SparseEmbeddingsSplade class with the specified configurations.

        Args:
            model_kwargs (Optional[dict[str, Any]]): Additional model configuration
                parameters for initialization. Defaults to None (treated as {}).
        """
        super().__init__(model_kwargs=model_kwargs)

        self.model_kwargs = model_kwargs if model_kwargs is not None else {}

        # Initialize the sparse embedding model with specified parameters
        self.sparse_embedding_model = SpladeEncoder(**self.model_kwargs)

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[dict[str, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[str, float]]: A list of sparse embedding dictionaries, one per
                document text, mapping terms to their corresponding weights.
        """
        return self.sparse_embedding_model.encode_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> dict[str, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[str, float]: A sparse embedding dictionary for the query text,
                where keys are terms and values are term weights.
        """
        # Pass the raw string: `encode_queries` returns a single dict for a str
        # input but a *list* of dicts for a list input. The previous revision
        # passed `[text]`, returning a one-element list and violating the
        # declared dict[str, float] return type.
        return self.sparse_embedding_model.encode_queries(text)
|
src/rag_pipelines/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.evaluation.evaluator import Evaluator
|
| 2 |
+
from rag_pipelines.evaluation.response.answer_relevancy import AnswerRelevancyScorer
|
| 3 |
+
from rag_pipelines.evaluation.response.faithfulness import FaithfulnessScorer
|
| 4 |
+
from rag_pipelines.evaluation.response.hallucination import HallucinationScorer
|
| 5 |
+
from rag_pipelines.evaluation.response.summarization import SummarizationScorer
|
| 6 |
+
from rag_pipelines.evaluation.retrieval.contextual_precision import ContextualPrecisionScorer
|
| 7 |
+
from rag_pipelines.evaluation.retrieval.contextual_recall import ContextualRecallScorer
|
| 8 |
+
from rag_pipelines.evaluation.retrieval.contextual_relevancy import ContextualRelevancyScorer
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"AnswerRelevancyScorer",
|
| 12 |
+
"ContextualPrecisionScorer",
|
| 13 |
+
"ContextualRecallScorer",
|
| 14 |
+
"ContextualRelevancyScorer",
|
| 15 |
+
"Evaluator",
|
| 16 |
+
"FaithfulnessScorer",
|
| 17 |
+
"HallucinationScorer",
|
| 18 |
+
"SummarizationScorer",
|
| 19 |
+
]
|
src/rag_pipelines/evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
|
| 3 |
+
from weave import Dataset, Evaluation, Model, Scorer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Evaluator:
    """Evaluate a model on a dataset using a list of scorers.

    Attributes:
        evaluation_name (str): The name of the evaluation run.
        evaluation_dataset (Dataset): The dataset used for evaluation.
        evaluation_scorers (list[Scorer]): Scorer objects used to evaluate the pipeline.
        pipeline (Model): The pipeline (model) to be evaluated.
    """

    def __init__(
        self,
        evaluation_name: str,
        evaluation_dataset: Dataset,
        evaluation_scorers: list[Scorer],
        pipeline: Model,
    ):
        """Initialize the Evaluator instance with the specified evaluation parameters.

        Args:
            evaluation_name (str): A unique identifier for the evaluation run.
            evaluation_dataset (Dataset): A `Dataset` object representing the data for evaluation.
            evaluation_scorers (list[Scorer]): A list of `Scorer` objects that calculate various metrics.
            pipeline (Model): The model or pipeline to evaluate.
        """
        self.evaluation_name = evaluation_name
        self.evaluation_dataset = evaluation_dataset
        self.evaluation_scorers = evaluation_scorers
        self.pipeline = pipeline

    def evaluate(self) -> dict:
        """Evaluate the pipeline using the configured dataset and scorers.

        Creates a weave `Evaluation`, runs it to completion on the pipeline,
        and returns the aggregated results.

        Returns:
            dict: The evaluation results produced by `Evaluation.evaluate`.
                (The previous revision was annotated ``-> None`` while
                returning this value.)

        Raises:
            RuntimeError: If the underlying evaluation run fails for any reason;
                the original exception is chained as the cause.
        """
        evaluation = Evaluation(
            evaluation_name=self.evaluation_name,
            dataset=self.evaluation_dataset,
            scorers=self.evaluation_scorers,
        )

        try:
            # Evaluation.evaluate is a coroutine; drive it to completion here.
            evaluation_results = asyncio.run(evaluation.evaluate(self.pipeline))
        except Exception as exception:
            msg = f"Evaluation run failed: {exception}"
            raise RuntimeError(msg) from exception

        return evaluation_results
|
src/rag_pipelines/evaluation/response/__init__.py
ADDED
|
File without changes
|
src/rag_pipelines/evaluation/response/answer_relevancy.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from statistics import variance
|
| 2 |
+
from typing import Optional, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import weave
|
| 6 |
+
from deepeval.metrics import AnswerRelevancyMetric
|
| 7 |
+
from deepeval.test_case import LLMTestCase
|
| 8 |
+
from weave import Scorer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). Declared as
    # Optional with a None default; actual values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the
                threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

            Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using
        the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (dict): The LLM generated response to evaluate and the retrieval context.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score}

        return result

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries containing the following keys:
                - "score" (float): The computed answer relevancy score.

        Returns:
            dict: A dictionary containing the following keys:
                - "answer_relevancy_score" (dict): A dictionary containing the following keys:
                    - "score" (float): The average answer relevancy score.
                    - "variance" (float): The variance of the answer relevancy scores.
                    - "std" (float): The standard deviation of the answer relevancy scores.
                    - "count" (int): The number of answer relevancy scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]
        count = len(scores)

        # Guard against an empty run: np.mean/var/std of an empty array yield NaN
        # (with a runtime warning), which is not a useful summary value.
        if count == 0:
            return {"answer_relevancy_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        score = np.mean(scores).item()
        variance = np.var(scores).item()
        std = np.std(scores).item()

        return {"answer_relevancy_score": {"score": score, "variance": variance, "std": std, "count": count}}
|
src/rag_pipelines/evaluation/response/faithfulness.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import FaithfulnessMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FaithfulnessScorer(Scorer):
    """Evaluate the faithfulness of LLM generated outputs.

    This scorer uses DeepEval's `Faithfulness` Metric.

    The faithfulness metric measures the quality of your LLM generation by evaluating whether the `actual_output`
    factually aligns with the contents of your `retrieval_context`.

    Attributes:
        threshold (float): The minimum score required to pass the faithfulness check, defaults to 0.5.
        model (str): The LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the assigned score, defaults to True.
        strict_mode (bool): When True, enforces binary scoring (1 for perfect alignment, 0 otherwise).
            Overrides the threshold to 1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to display intermediate steps during metric computation, defaults to False.
        truths_extraction_limit (Optional[int]): Limits the number of key facts to extract from the retrieval
            context for evaluation, ordered by importance. Defaults to None.
        metric (FaithfulnessMetric): An instance of DeepEval's `FaithfulnessMetric` for scoring.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). Declared as
    # Optional with a None default; actual values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    truths_extraction_limit: Optional[int] = None
    metric: Optional[FaithfulnessMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Faithfulness Scorer with DeepEval's Faithfulness Metric.

        Args:
            threshold (float): The minimum score required to pass the faithfulness check, defaults to 0.5.
            model (str): The LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the assigned score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect alignment, 0 otherwise).
                Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to display intermediate steps during metric computation, defaults to False.
            truths_extraction_limit (Optional[int]): Limits the number of key facts to extract from the retrieval
                context for evaluation, ordered by importance. Defaults to None.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            truths_extraction_limit=truths_extraction_limit,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose
        self.truths_extraction_limit = truths_extraction_limit

        # Forward truths_extraction_limit to the metric: previously it was stored
        # but never used, silently ignoring the caller's setting.
        self.metric = FaithfulnessMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the faithfulness of an LLM generated response.

        Faithfulness is calculated as:

            Faithfulness = (Number of Truthful Claims) / (Total Number of Claims)

        The Faithfulness Metric evaluates all claims in the `actual_output` and checks
        whether they are truthful based on the facts in the `retrieval_context`. Claims
        are marked truthful if they align with or do not contradict any facts in the context.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed faithfulness score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score, "reason": self.metric.reason}

        return result
|
src/rag_pipelines/evaluation/response/hallucination.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import HallucinationMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class HallucinationScorer(Scorer):
    """Evaluate the factual alignment of the generated output with the provided context.

    This scorer uses DeepEval's `Hallucination` Metric to assess how well the generated output
    aligns with the reference context.

    The Hallucination metric determines whether your LLM generates factually correct information by comparing
    the `actual_output` to the provided `context`.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for perfection,
            0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to False.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric to the console,
            defaults to False.
        metric (HallucinationMetric): The DeepEval HallucinationMetric.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). Declared as
    # Optional with a None default; actual values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[HallucinationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Hallucination scorer using DeepEval's Hallucination Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for perfection,
                0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to False.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said metric to the console,
                defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = HallucinationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the factual alignment of the generated output with the provided context.

        The Hallucination Score is calculated according to the following equation:

            Hallucination = Number of Contradicted Contexts / Total Number of Contexts

        The Hallucination Score uses an LLM to determine, for each context in `contexts`, whether there are any
        contradictions to the `actual_output`.

        Although extremely similar to the Faithfulness Scorer, the Hallucination Score is calculated differently
        since it uses `contexts` as the source of truth instead. Since `contexts` is the ideal segment of your
        knowledge base relevant to a specific input, the degree of hallucination can be measured by the degree
        of which the `contexts` is disagreed upon.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed hallucination score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score, "reason": self.metric.reason}

        return result
|
src/rag_pipelines/evaluation/response/phoenix_hallucination.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import AnswerRelevancyMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    NOTE(review): this class lives in `phoenix_hallucination.py` but wraps DeepEval's
    AnswerRelevancyMetric, not a Phoenix hallucination evaluator — confirm intent.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model); siblings in this
    # package declare fields and call super().__init__, which this class was missing.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the
                threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # AnswerRelevancyMetric takes `verbose_mode` (as used by every sibling
        # scorer in this package), not `verbose`.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

            Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using
        the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score, "reason": self.metric.reason}

        return result
|
src/rag_pipelines/evaluation/response/summarization.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import SummarizationMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SummarizationScorer(Scorer):
    """Summarization Scorer.

    This scorer uses DeepEval's `Summarization` metric to assess how well the generated output
    aligns with the reference context.

    The summarization metric uses LLMs to determine whether the LLM application is generating
    factually correct summaries while including the necessary details from the original text.

    Attributes:
        threshold (float): Minimum passing threshold, defaults to 0.5.
        model (str): LLM model for scoring, defaults to "gpt-4".
        include_reason (bool): Include reason for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
        async_mode (bool): Use asynchronous scoring, defaults to True.
        verbose (bool): Print intermediate steps used for scoring, defaults to False.
        assessment_questions (Optional[list[str]]): A list of closed-ended questions that can be
            answered with either a 'yes' or a 'no'. These are questions you want your summary to
            ideally be able to answer, and are especially helpful if you already know what a good
            summary for your use case looks like. If not provided, the metric generates its own
            questions at evaluation time.
        n (Optional[int]): Number of assessment questions to generate when `assessment_questions`
            is not provided. Defaults to 5.
        truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
            from the retrieval_context. Defaults to None.
        metric (Optional[SummarizationMetric]): An instance of DeepEval's `SummarizationMetric`
            used for scoring; built in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    assessment_questions: Optional[list[str]] = None
    n: Optional[int] = 5
    truths_extraction_limit: Optional[int] = None
    metric: Optional[SummarizationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        assessment_questions: Optional[list[str]] = None,
        n: Optional[int] = 5,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Summarization Scorer with DeepEval's Summarization Metric.

        Args:
            threshold (float): Minimum passing threshold, defaults to 0.5.
            model (str): LLM model for scoring, defaults to "gpt-4".
            include_reason (bool): Include reason for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
            async_mode (bool): Use asynchronous scoring, defaults to True.
            verbose (bool): Print intermediate steps used for scoring, defaults to False.
            assessment_questions (Optional[list[str]]): A list of closed-ended questions that can
                be answered with either a 'yes' or a 'no'. These are questions you want your
                summary to ideally be able to answer. If `assessment_questions` is not provided,
                the metric will generate a set of `assessment_questions` at evaluation time.
            n (Optional[int]): The number of assessment questions to generate when
                `assessment_questions` is not provided. Defaults to 5.
            truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
                from the retrieval_context. Defaults to None.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            assessment_questions=assessment_questions,
            n=n,
            truths_extraction_limit=truths_extraction_limit,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose
        self.assessment_questions = assessment_questions
        self.n = n
        self.truths_extraction_limit = truths_extraction_limit

        self.metric = SummarizationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            assessment_questions=self.assessment_questions,
            n=self.n,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the quality of summarization of an LLM generated response.

        The Summarization score is calculated according to the following equation:

            Summarization = min(Alignment Score, Coverage Score)

        where,
        - Alignment Score: determines whether the summary contains hallucinated or contradictory
          information relative to the original text.
        - Coverage Score: determines whether the summary contains the necessary information from
          the original text.

        While the Alignment Score is similar to that of the Hallucination Score, the Coverage
        Score is first calculated by generating n closed-ended questions that can only be
        answered with either a 'yes' or a 'no', before calculating the ratio for which the
        original text and summary yield the same answer.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information
                to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed summarization score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # measure() populates `score` and `reason` on the metric instance.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
|
src/rag_pipelines/evaluation/retrieval/__init__.py
ADDED
|
File without changes
|
src/rag_pipelines/evaluation/retrieval/contextual_precision.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import weave
|
| 5 |
+
from deepeval.metrics import ContextualPrecisionMetric
|
| 6 |
+
from deepeval.test_case import LLMTestCase
|
| 7 |
+
from weave import Scorer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ContextualPrecisionScorer(Scorer):
    """Evaluate the contextual precision of the generated output with the provided context.

    This scorer uses DeepEval's `Contextual Precision` metric to assess how well the generated
    output aligns with the reference context.

    The contextual precision metric measures the quality of the pipeline's retriever by
    evaluating whether results in the `retrieval_context` that are relevant to the given input
    are ranked higher than irrelevant ones.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults
            to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
            1 for perfection, 0 otherwise. It also overrides the current threshold and sets it
            to 1. Defaults to True here.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric
            to the console, defaults to False.
        metric (Optional[ContextualPrecisionMetric]): The DeepEval ContextualPrecisionMetric,
            built in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = True
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualPrecisionMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Precision Scorer using DeepEval's Contextual Precision Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score,
                defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said
                metric to the console, defaults to False.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = ContextualPrecisionMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual precision of the generated output with the provided context.

        The Contextual Precision Score is calculated according to the following equation:

            Contextual Precision = (1 / Number of Relevant Results)
                * Sum over k of (Number of Relevant Results up to position k / k)
                * (Binary Relevance of the k'th result)

        where,
        - k: The position of the result in the list of all results.

        The Contextual Precision Scorer first uses an LLM to determine for each result in the
        `retrieval_context` whether it is relevant to the input based on information in the
        `expected_output`, before calculating the weighted cumulative precision as the
        contextual precision score.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The pipeline output, expected to carry the keys "output"
                (the generated answer) and "retrieval_context" (the retrieved passages).
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults
                to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual precision score.
        """
        # Guard against the default: the original code called output.get(...) on None.
        output = output or {}

        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the Contextual Precision Scorer.

        Args:
            score_rows (list): A list of dictionaries, each containing:
                - "score" (float): A computed contextual precision score.

        Returns:
            dict: A dictionary with key "contextual_precision_score" mapping to:
                - "score" (float): The average contextual precision score.
                - "variance" (float): The variance of the contextual precision scores.
                - "std" (float): The standard deviation of the contextual precision scores.
                - "count" (int): The number of contextual precision scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # np.mean([]) yields nan with a RuntimeWarning; report zeros for an empty batch.
        if not scores:
            return {"contextual_precision_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        return {
            "contextual_precision_score": {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        }
|
src/rag_pipelines/evaluation/retrieval/contextual_recall.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import ContextualRecallMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ContextualRecallScorer(Scorer):
    """Evaluate the contextual recall of the generated output with the provided context.

    This scorer uses DeepEval's `ContextualRecall` metric to assess how well the generated
    output aligns with the reference context.

    The contextual recall metric measures the quality of the pipeline's retriever by evaluating
    the extent to which the `retrieval_context` aligns with the `expected_output`.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults
            to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
            1 for perfection, 0 otherwise. It also overrides the current threshold and sets it
            to 1. Defaults to True here.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric
            to the console, defaults to False.
        metric (Optional[ContextualRecallMetric]): The DeepEval ContextualRecallMetric, built
            in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = True
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualRecallMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Recall Scorer using DeepEval's Contextual Recall Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score,
                defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said
                metric to the console, defaults to False.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = ContextualRecallMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual recall of the generated output with the provided context.

        The Contextual Recall Score is calculated according to the following equation:

            Contextual Recall = Number of Attributable Results / Total Number of Results

        The Contextual Recall Scorer first uses an LLM to extract all statements made in the
        `expected_output`, before using the same LLM to classify whether each statement can be
        attributed to results in the `retrieval_context`.

        A higher contextual recall score represents a greater ability of the retrieval system
        to capture all relevant information from the total available relevant set within your
        knowledge base.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information
                to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults
                to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual recall score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # measure() populates `score` and `reason` on the metric instance.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
|
src/rag_pipelines/evaluation/retrieval/contextual_relevancy.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import ContextualRelevancyMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ContextualRelevancyScorer(Scorer):
    """Evaluate the contextual relevancy of the generated output with the provided context.

    This scorer uses DeepEval's `ContextualRelevancy` metric to assess how well the generated
    output aligns with the reference context.

    The contextual relevancy metric measures the quality of the RAG pipeline's retriever by
    evaluating the overall relevance of the information presented in the `retrieval_context`
    for a given input.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults
            to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
            1 for perfection, 0 otherwise. It also overrides the current threshold and sets it
            to 1. Defaults to True here.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric
            to the console, defaults to False.
        metric (Optional[ContextualRelevancyMetric]): The DeepEval ContextualRelevancyMetric,
            built in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = True
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Relevancy Scorer using DeepEval's Contextual Relevancy Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score,
                defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said
                metric to the console, defaults to False.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = ContextualRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual relevancy of the generated output with the provided context.

        The Contextual Relevancy Score is calculated according to the following equation:

            Contextual Relevancy = Number of Relevant Results / Total Number of Results

        Although similar to how the Answer Relevancy Score is calculated, the Contextual
        Relevancy Metric first uses an LLM to extract all statements made in the
        `retrieval_context` instead, before using the same LLM to classify whether each
        statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information
                to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults
                to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # measure() populates `score` and `reason` on the metric instance.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
|
src/rag_pipelines/llms/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.llms.groq import ChatGroqGenerator
|
| 2 |
+
|
| 3 |
+
__all__ = ["ChatGroqGenerator"]
|
src/rag_pipelines/llms/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (256 Bytes). View file
|
|
|
src/rag_pipelines/llms/__pycache__/groq.cpython-310.pyc
ADDED
|
Binary file (3.63 kB). View file
|
|
|
src/rag_pipelines/llms/groq.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
import weave
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_groq import ChatGroq
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
|
| 9 |
+
from rag_pipelines.prompts import STRUCTURED_RAG_PROMPT, RAGResponseModel
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ChatGroqGenerator:
    """Generate responses with a ChatGroq model from user queries and graded documents.

    Wraps prompt construction, structured-output binding, and model invocation behind a
    single callable interface.
    """

    model: str
    api_key: str
    llm_params: dict[str, Any]
    llm: Optional[ChatGroq] = None
    structured_output_model: BaseModel
    system_prompt: str

    def __init__(
        self,
        model: str,
        api_key: Optional[str] = None,
        llm_params: Optional[dict[str, Any]] = None,
        structured_output_model: BaseModel = RAGResponseModel,
        system_prompt: str = STRUCTURED_RAG_PROMPT,
    ):
        """Initialize the ChatGroqGenerator with configuration parameters.

        Args:
            model (str): The name of the ChatGroq model to use.
            api_key (Optional[str]): API key for the ChatGroq service. If not provided,
                the "GROQ_API_KEY" environment variable will be used.
            llm_params (Optional[dict]): Additional parameters for configuring the ChatGroq model.
            structured_output_model (BaseModel): The output model for structured responses.
            system_prompt (str): The system prompt for the ChatGroq model.

        Raises:
            ValueError: If the API key is not provided and the "GROQ_API_KEY" environment
                variable is not set.
        """
        resolved_key = api_key or os.environ.get("GROQ_API_KEY")
        if resolved_key is None:
            msg = "GROQ_API_KEY is not set. Please provide an API key or set it as an environment variable."
            raise ValueError(msg)

        self.model = model
        self.api_key = resolved_key
        self.llm_params = {} if llm_params is None else llm_params
        self.structured_output_model = structured_output_model
        self.system_prompt = system_prompt

        self.llm = ChatGroq(model=self.model, api_key=self.api_key, **self.llm_params)

    @weave.op()
    def __call__(self, state: dict[str, Any]) -> dict[str, Any]:
        """Generate a response using the current state of user prompts and graded documents.

        Args:
            state (dict[str, Any]): The current state, containing:
                - 'question': The user question.
                - 'context': A list of filtered document texts.
                - 'documents': A list of retrieved documents.

        Returns:
            dict[str, Any]: A dictionary containing:
                - 'question': The user question.
                - 'context': A list of filtered document texts.
                - 'documents': A list of retrieved documents.
                - 'answer': The generated response.
        """
        question = state["question"]
        context = state["context"]
        documents = state["documents"]

        # Single system message; the prompt template interpolates question and context.
        chat_prompt = ChatPromptTemplate.from_messages([("system", self.system_prompt)])
        structured_llm = self.llm.with_structured_output(self.structured_output_model)
        chain = chat_prompt | structured_llm

        reply = chain.invoke({"question": question, "context": "\n".join(context)})

        return {
            "question": question,
            "context": context,
            "documents": documents,
            "answer": reply.final_answer,
        }
|
src/rag_pipelines/pipelines/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.pipelines.self_rag import SelfRAGPipeline
|
| 2 |
+
|
| 3 |
+
__all__ = ["SelfRAGPipeline"]
|
src/rag_pipelines/pipelines/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (268 Bytes). View file
|
|
|
src/rag_pipelines/pipelines/__pycache__/self_rag.cpython-310.pyc
ADDED
|
Binary file (5.1 kB). View file
|
|
|
src/rag_pipelines/pipelines/__pycache__/self_rag_graph_state.cpython-310.pyc
ADDED
|
Binary file (1.09 kB). View file
|
|
|
src/rag_pipelines/pipelines/adaptive_rag.py
ADDED
|
File without changes
|
src/rag_pipelines/pipelines/adaptive_rag_graph_state.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.documents import Document
|
| 2 |
+
from typing_extensions import TypedDict
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class AdaptiveRAGGraphState(TypedDict):
|
| 6 |
+
"""Represents the state of the graph for the Adaptive Retrieval-Augmentation-Generation (Adaptive-RAG) pipeline.
|
| 7 |
+
|
| 8 |
+
Attributes:
|
| 9 |
+
question (str): The input question for the pipeline.
|
| 10 |
+
answer (str): The generated response from the LLM.
|
| 11 |
+
documents (list[Document]): A list of LangChain documents that are retrieved and processed through the pipeline.
|
| 12 |
+
context (list[str]): The final list of context documents passed to the LLM for generating the answer.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
question: str
|
| 16 |
+
answer: str
|
| 17 |
+
documents: list[Document]
|
| 18 |
+
context: list[str]
|
src/rag_pipelines/pipelines/crag.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
import weave
|
| 5 |
+
from langchain_community.retrievers import PineconeHybridSearchRetriever
|
| 6 |
+
from langchain_core.prompts.chat import ChatPromptTemplate
|
| 7 |
+
from langgraph.graph import END, START, StateGraph
|
| 8 |
+
from langgraph.graph.state import CompiledStateGraph
|
| 9 |
+
from weave.integrations.langchain import WeaveTracer
|
| 10 |
+
|
| 11 |
+
from rag_pipelines.llms.groq import ChatGroqGenerator
|
| 12 |
+
from rag_pipelines.pipelines.crag_graph_state import CRAGGraphState
|
| 13 |
+
from rag_pipelines.query_transformer import QueryTransformer
|
| 14 |
+
from rag_pipelines.retrieval_evaluator import DocumentGrader, QueryDecisionMaker
|
| 15 |
+
from rag_pipelines.websearch import WebSearch
|
| 16 |
+
|
| 17 |
+
# Disable global tracing explicitly
|
| 18 |
+
os.environ["WEAVE_TRACE_LANGCHAIN"] = "false"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CorrectiveRAGPipeline(weave.Model):
|
| 22 |
+
"""A corrective retrieval-augmented generation (RAG) pipeline using Weave for tracing and LangChain components.
|
| 23 |
+
|
| 24 |
+
This pipeline integrates document retrieval, relevance evaluation, grading, query transformation, web search,
|
| 25 |
+
and LLM-based response generation to implement a corrective RAG system. It utilizes Weave for tracing execution
|
| 26 |
+
details and LangChain components for processing.
|
| 27 |
+
|
| 28 |
+
Attributes:
|
| 29 |
+
retriever (Optional[PineconeHybridSearchRetriever]): The retrieval model used to fetch relevant documents based on a query.
|
| 30 |
+
prompt (Optional[ChatPromptTemplate]): The prompt template to generate questions for the LLM.
|
| 31 |
+
generator (Optional[ChatGroqGenerator]): The language model used to generate responses.
|
| 32 |
+
grader (Optional[DocumentGrader]): Grades documents based on evaluation results.
|
| 33 |
+
query_transformer (Optional[QueryTransformer]): Transforms user queries to optimize retrieval.
|
| 34 |
+
web_search (Optional[WebSearch]): Performs web search for additional context.
|
| 35 |
+
tracing_project_name (str): The name of the Weave project for tracing.
|
| 36 |
+
weave_params (Dict[str, Any]): Parameters for initializing Weave.
|
| 37 |
+
tracer (Optional[WeaveTracer]): The tracer used to record execution details with Weave.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
retriever: Optional[PineconeHybridSearchRetriever] = None
|
| 41 |
+
prompt: Optional[ChatPromptTemplate] = None
|
| 42 |
+
generator: Optional[ChatGroqGenerator] = None
|
| 43 |
+
grader: Optional[DocumentGrader] = None
|
| 44 |
+
query_transformer: Optional[QueryTransformer] = None
|
| 45 |
+
web_search: Optional[WebSearch] = None
|
| 46 |
+
tracing_project_name: str
|
| 47 |
+
weave_params: dict[str, Any]
|
| 48 |
+
tracer: Optional[WeaveTracer] = None
|
| 49 |
+
|
| 50 |
+
def __init__(
|
| 51 |
+
self,
|
| 52 |
+
retriever: PineconeHybridSearchRetriever,
|
| 53 |
+
prompt: ChatPromptTemplate,
|
| 54 |
+
generator: ChatGroqGenerator,
|
| 55 |
+
grader: DocumentGrader,
|
| 56 |
+
query_transformer: QueryTransformer,
|
| 57 |
+
web_search: WebSearch,
|
| 58 |
+
tracing_project_name: str = "corrective_rag",
|
| 59 |
+
weave_params: Optional[dict[str, Any]] = None,
|
| 60 |
+
):
|
| 61 |
+
"""Initialize the CorrectiveRAGPipeline.
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
retriever (PineconeHybridSearchRetriever): The retrieval model used to fetch documents for the RAG pipeline.
|
| 65 |
+
prompt (ChatPromptTemplate): The prompt template used to create questions for the LLM.
|
| 66 |
+
generator (ChatGroqGenerator): The language model used for response generation.
|
| 67 |
+
grader (DocumentGrader): Component to grade the relevance of evaluated documents.
|
| 68 |
+
query_transformer (QueryTransformer): Component to transform the user query.
|
| 69 |
+
web_search (WebSearch): Component to perform web search for additional context.
|
| 70 |
+
tracing_project_name (str): The name of the Weave project for tracing. Defaults to "corrective_rag".
|
| 71 |
+
weave_params (Dict[str, Any]): Additional parameters for initializing Weave.
|
| 72 |
+
"""
|
| 73 |
+
if weave_params is None:
|
| 74 |
+
weave_params = {}
|
| 75 |
+
|
| 76 |
+
super().__init__(
|
| 77 |
+
retriever=retriever,
|
| 78 |
+
prompt=prompt,
|
| 79 |
+
generator=generator,
|
| 80 |
+
grader=grader,
|
| 81 |
+
query_transformer=query_transformer,
|
| 82 |
+
web_search=web_search,
|
| 83 |
+
tracing_project_name=tracing_project_name,
|
| 84 |
+
weave_params=weave_params,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
self.retriever = retriever
|
| 88 |
+
self.prompt = prompt
|
| 89 |
+
self.generator = generator
|
| 90 |
+
self.grader = grader
|
| 91 |
+
self.query_transformer = query_transformer
|
| 92 |
+
self.web_search = web_search
|
| 93 |
+
self.tracing_project_name = tracing_project_name
|
| 94 |
+
self.weave_params = weave_params
|
| 95 |
+
|
| 96 |
+
self._initialize_weave(**weave_params)
|
| 97 |
+
|
| 98 |
+
def _initialize_weave(self, **weave_params) -> None:
|
| 99 |
+
"""Initialize Weave with the specified tracing project name.
|
| 100 |
+
|
| 101 |
+
Sets up the Weave environment and creates a tracer for monitoring pipeline execution.
|
| 102 |
+
|
| 103 |
+
Args:
|
| 104 |
+
weave_params (Dict[str, Any]): Additional parameters for configuring Weave.
|
| 105 |
+
"""
|
| 106 |
+
weave.init(self.tracing_project_name, **weave_params)
|
| 107 |
+
self.tracer = WeaveTracer()
|
| 108 |
+
|
| 109 |
+
def _build_crag_graph(self) -> CompiledStateGraph:
|
| 110 |
+
"""Build and compile the corrective RAG workflow graph.
|
| 111 |
+
|
| 112 |
+
The graph defines the flow between components like retrieval, grading, query transformation,
|
| 113 |
+
web search, and generation.
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
CompiledStateGraph: The compiled state graph representing the corrective RAG pipeline workflow.
|
| 117 |
+
"""
|
| 118 |
+
crag_workflow = StateGraph(CRAGGraphState)
|
| 119 |
+
|
| 120 |
+
# Define the nodes
|
| 121 |
+
crag_workflow.add_node("retrieve", self.retriever)
|
| 122 |
+
crag_workflow.add_node("grade_documents", self.grader)
|
| 123 |
+
crag_workflow.add_node("generate", self.generator)
|
| 124 |
+
crag_workflow.add_node("transform_query", self.query_transformer)
|
| 125 |
+
crag_workflow.add_node("web_search_node", self.web_search)
|
| 126 |
+
|
| 127 |
+
# Define edges between nodes
|
| 128 |
+
crag_workflow.add_edge(START, "retrieve")
|
| 129 |
+
crag_workflow.add_edge("retrieve", "grade_documents")
|
| 130 |
+
crag_workflow.add_conditional_edges(
|
| 131 |
+
"grade_documents",
|
| 132 |
+
QueryDecisionMaker(),
|
| 133 |
+
{
|
| 134 |
+
"transform_query": "transform_query",
|
| 135 |
+
"generate": "generate",
|
| 136 |
+
},
|
| 137 |
+
)
|
| 138 |
+
crag_workflow.add_edge("transform_query", "web_search_node")
|
| 139 |
+
crag_workflow.add_edge("web_search_node", "generate")
|
| 140 |
+
crag_workflow.add_edge("generate", END)
|
| 141 |
+
|
| 142 |
+
# Compile the graph
|
| 143 |
+
crag_pipeline = crag_workflow.compile()
|
| 144 |
+
|
| 145 |
+
return crag_pipeline
|
| 146 |
+
|
| 147 |
+
@weave.op()
|
| 148 |
+
def predict(self, question: str) -> str:
|
| 149 |
+
"""Execute the corrective RAG pipeline with the given question.
|
| 150 |
+
|
| 151 |
+
The pipeline retrieves documents, evaluates and grades their relevance, and generates a final response
|
| 152 |
+
using the LLM.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
question (str): The input question to be answered.
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
str: The final answer generated by the LLM.
|
| 159 |
+
|
| 160 |
+
Example:
|
| 161 |
+
```python
|
| 162 |
+
pipeline = CorrectiveRAGPipeline(...)
|
| 163 |
+
answer = pipeline.predict("What are the latest AI trends?")
|
| 164 |
+
print(answer)
|
| 165 |
+
```
|
| 166 |
+
"""
|
| 167 |
+
config = {"callbacks": [self.tracer]}
|
| 168 |
+
|
| 169 |
+
crag_graph = self._build_crag_graph()
|
| 170 |
+
response = crag_graph.invoke(question, config=config)
|
| 171 |
+
|
| 172 |
+
return response
|
src/rag_pipelines/pipelines/crag_graph_state.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing_extensions import TypedDict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class CRAGGraphState(TypedDict):
|
| 5 |
+
"""Represents the state of the graph for the Corrective Retrieval-Augmentation-Generation (CRAG) pipeline.
|
| 6 |
+
|
| 7 |
+
Attributes:
|
| 8 |
+
question (str): The input question for the pipeline.
|
| 9 |
+
generation (str): The generated response from the LLM.
|
| 10 |
+
web_search (str): Indicates whether a web search is required (e.g., "yes" or "no").
|
| 11 |
+
documents (List[str]): A list of relevant documents retrieved or processed.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
question: str
|
| 15 |
+
generation: str
|
| 16 |
+
web_search: str
|
| 17 |
+
documents: list[str]
|
src/rag_pipelines/pipelines/dspy/dspy_baseline_rag.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from dspy_modules.evaluator import DSPyEvaluator
|
| 6 |
+
from dspy_modules.rag import DSPyRAG
|
| 7 |
+
from dspy_modules.weaviate_db import WeaviateVectorDB
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main(cluster_url, api_key, index_name, model_name, llm_model, llm_api_key):
|
| 11 |
+
# Load dataset
|
| 12 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 13 |
+
questions = earnings_calls_data["question"]
|
| 14 |
+
|
| 15 |
+
# Split into datasets
|
| 16 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 17 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 18 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 19 |
+
|
| 20 |
+
# Initialize Weaviate VectorDB
|
| 21 |
+
weaviate_db = WeaviateVectorDB(cluster_url, api_key, index_name, model_name)
|
| 22 |
+
|
| 23 |
+
# Initialize LLM
|
| 24 |
+
llm = dspy.LM(llm_model, api_key=llm_api_key, num_retries=120)
|
| 25 |
+
dspy.configure(lm=llm)
|
| 26 |
+
|
| 27 |
+
# Initialize RAG
|
| 28 |
+
rag = DSPyRAG(weaviate_db)
|
| 29 |
+
|
| 30 |
+
# Evaluate before compilation
|
| 31 |
+
evaluator = DSPyEvaluator()
|
| 32 |
+
evaluate = dspy.Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 33 |
+
evaluate(rag, metric=evaluator.llm_metric)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
if __name__ == "__main__":
|
| 37 |
+
parser = argparse.ArgumentParser(description="Run DSPy-based RAG pipeline")
|
| 38 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL")
|
| 39 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key")
|
| 40 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name")
|
| 41 |
+
parser.add_argument("--model_name", type=str, required=True, help="Embedding model name")
|
| 42 |
+
parser.add_argument("--llm_model", type=str, required=True, help="LLM model name")
|
| 43 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="LLM API key")
|
| 44 |
+
|
| 45 |
+
args = parser.parse_args()
|
| 46 |
+
main(args.cluster_url, args.api_key, args.index_name, args.model_name, args.llm_model, args.llm_api_key)
|
src/rag_pipelines/pipelines/dspy/dspy_bayesian_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.primitives.prediction import Prediction
|
| 8 |
+
from dspy.teleprompt import BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
|
| 9 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 10 |
+
from langchain_weaviate.vectorstores import WeaviateVectorStore
|
| 11 |
+
from weaviate.classes.init import Auth
|
| 12 |
+
|
| 13 |
+
# Argument Parser
|
| 14 |
+
parser = argparse.ArgumentParser(description="RAG Optimization with DSPy")
|
| 15 |
+
parser.add_argument(
|
| 16 |
+
"--optimizer",
|
| 17 |
+
type=str,
|
| 18 |
+
choices=["bootstrap", "bayesian"],
|
| 19 |
+
default="bootstrap",
|
| 20 |
+
help="Choose the optimization method",
|
| 21 |
+
)
|
| 22 |
+
args = parser.parse_args()
|
| 23 |
+
|
| 24 |
+
# Load dataset
|
| 25 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 26 |
+
questions = earnings_calls_data["question"]
|
| 27 |
+
|
| 28 |
+
# Create DSPy datasets
|
| 29 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 30 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 31 |
+
testset = [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 32 |
+
|
| 33 |
+
# Embeddings and Weaviate client
|
| 34 |
+
embeddings = HuggingFaceEmbeddings(
|
| 35 |
+
model_name="jinaai/jina-embeddings-v3",
|
| 36 |
+
model_kwargs={"device": "cpu", "trust_remote_code": True},
|
| 37 |
+
encode_kwargs={"task": "retrieval.query", "prompt_name": "retrieval.query"},
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 41 |
+
cluster_url="https://adrrwus9shkxkuijvazcrq.c0.us-west3.gcp.weaviate.cloud",
|
| 42 |
+
auth_credentials=Auth.api_key("J94gHySMWTWxggDDayGrF2ESGo23yOHZ1bUC"),
|
| 43 |
+
)
|
| 44 |
+
weaviate_db = WeaviateVectorStore(
|
| 45 |
+
index_name="LangChain_d73ad6159d514fec887456fa6db11e61",
|
| 46 |
+
embedding=embeddings,
|
| 47 |
+
client=weaviate_client,
|
| 48 |
+
text_key="text",
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# Configure LLM
|
| 52 |
+
llm = dspy.LM(
|
| 53 |
+
"groq/llama-3.3-70b-versatile",
|
| 54 |
+
api_key="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",
|
| 55 |
+
num_retries=120,
|
| 56 |
+
)
|
| 57 |
+
dspy.configure(lm=llm)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Define DSPy Module
|
| 61 |
+
class GenerateAnswer(dspy.Signature):
|
| 62 |
+
context = dspy.InputField(desc="may contain relevant facts")
|
| 63 |
+
question = dspy.InputField()
|
| 64 |
+
answer = dspy.OutputField(desc="short and precise answer")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class RAG(dspy.Module):
|
| 68 |
+
def __init__(self):
|
| 69 |
+
super().__init__()
|
| 70 |
+
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
|
| 71 |
+
|
| 72 |
+
def retrieve(self, question):
|
| 73 |
+
results = weaviate_db.similarity_search(query=question)
|
| 74 |
+
passages = [res.page_content for res in results]
|
| 75 |
+
return Prediction(passages=passages)
|
| 76 |
+
|
| 77 |
+
def forward(self, question):
|
| 78 |
+
context = self.retrieve(question).passages
|
| 79 |
+
prediction = self.generate_answer(context=context, question=question)
|
| 80 |
+
return dspy.Prediction(context=context, answer=prediction.answer)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Define LLM Metric
|
| 84 |
+
def llm_metric(gold, pred, trace=None):
|
| 85 |
+
predicted_answer = pred.answer
|
| 86 |
+
context = pred.context
|
| 87 |
+
detail = dspy.ChainOfThought(GenerateAnswer)(
|
| 88 |
+
context="N/A", assessed_question="Is the answer detailed?", assessed_answer=predicted_answer
|
| 89 |
+
)
|
| 90 |
+
faithful = dspy.ChainOfThought(GenerateAnswer)(
|
| 91 |
+
context=context, assessed_question="Is it grounded in context?", assessed_answer=predicted_answer
|
| 92 |
+
)
|
| 93 |
+
overall = dspy.ChainOfThought(GenerateAnswer)(
|
| 94 |
+
context=context, assessed_question=f"Rate the answer: {predicted_answer}", assessed_answer=predicted_answer
|
| 95 |
+
)
|
| 96 |
+
total = float(detail.answer) + float(faithful.answer) * 2 + float(overall.answer)
|
| 97 |
+
return total / 5.0
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# Evaluate before optimization
|
| 101 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 102 |
+
evaluate(RAG(), metric=llm_metric)
|
| 103 |
+
|
| 104 |
+
# Select Optimizer
|
| 105 |
+
if args.optimizer == "bootstrap":
|
| 106 |
+
optimizer = BootstrapFewShotWithRandomSearch(
|
| 107 |
+
metric=llm_metric,
|
| 108 |
+
max_bootstrapped_demos=4,
|
| 109 |
+
max_labeled_demos=4,
|
| 110 |
+
max_rounds=1,
|
| 111 |
+
num_candidate_programs=2,
|
| 112 |
+
num_threads=2,
|
| 113 |
+
)
|
| 114 |
+
else:
|
| 115 |
+
optimizer = BayesianSignatureOptimizer(
|
| 116 |
+
task_model=dspy.settings.lm, metric=llm_metric, prompt_model=dspy.settings.lm, n=5, verbose=False
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Compile optimized RAG
|
| 120 |
+
optimized_compiled_rag = optimizer.compile(RAG(), trainset=trainset)
|
| 121 |
+
|
| 122 |
+
# Evaluate optimized RAG
|
| 123 |
+
evaluate = Evaluate(metric=llm_metric, devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 124 |
+
evaluate(optimized_compiled_rag)
|
src/rag_pipelines/pipelines/dspy/dspy_bootstrap_few_shot_optimization_rag.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 6 |
+
from dspy.teleprompt import BootstrapFewShot
|
| 7 |
+
from dspy_modules.evaluator import llm_metric
|
| 8 |
+
from dspy_modules.rag import RAG
|
| 9 |
+
from dspy_modules.weaviate_db import WeaviateVectorDB
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def main(args):
|
| 13 |
+
# Load dataset
|
| 14 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 15 |
+
questions = earnings_calls_data["question"]
|
| 16 |
+
|
| 17 |
+
# Split dataset
|
| 18 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 19 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 20 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 21 |
+
|
| 22 |
+
# Initialize Weaviate VectorDB
|
| 23 |
+
weaviate_db = WeaviateVectorDB(
|
| 24 |
+
cluster_url=args.cluster_url, api_key=args.api_key, index_name=args.index_name, model_name=args.embedding_model
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# Initialize LLM
|
| 28 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
|
| 29 |
+
dspy.configure(lm=llm)
|
| 30 |
+
|
| 31 |
+
# Initialize and evaluate unoptimized RAG
|
| 32 |
+
uncompiled_rag = RAG(weaviate_db)
|
| 33 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 34 |
+
evaluate(uncompiled_rag, metric=llm_metric)
|
| 35 |
+
|
| 36 |
+
# Optimize RAG using BootstrapFewShot
|
| 37 |
+
optimizer = BootstrapFewShot(metric=llm_metric)
|
| 38 |
+
optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
|
| 39 |
+
|
| 40 |
+
# Evaluate optimized RAG
|
| 41 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 42 |
+
evaluate(optimized_compiled_rag)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
|
| 46 |
+
parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
|
| 47 |
+
|
| 48 |
+
# Weaviate parameters
|
| 49 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL")
|
| 50 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key")
|
| 51 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name")
|
| 52 |
+
parser.add_argument("--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model name")
|
| 53 |
+
|
| 54 |
+
# LLM parameters
|
| 55 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name")
|
| 56 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="LLM API key")
|
| 57 |
+
parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM calls")
|
| 58 |
+
|
| 59 |
+
args = parser.parse_args()
|
| 60 |
+
main(args)
|
src/rag_pipelines/pipelines/dspy/dspy_copro_instruction_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# https://github.com/weaviate/recipes/blob/main/integrations/llm-frameworks/dspy/1.Getting-Started-with-RAG-in-DSPy.ipynb
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.primitives.prediction import Prediction
|
| 8 |
+
from dspy.teleprompt import COPRO
|
| 9 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 10 |
+
from langchain_weaviate.vectorstores import WeaviateVectorStore
|
| 11 |
+
from weaviate.classes.init import Auth
|
| 12 |
+
|
| 13 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 14 |
+
questions = earnings_calls_data["question"]
|
| 15 |
+
|
| 16 |
+
# Create the dspy datasets
|
| 17 |
+
trainset = questions[:20] # 20 examples for training
|
| 18 |
+
devset = questions[20:30] # 10 examples for development
|
| 19 |
+
testset = questions[30:] # 20 examples for testing
|
| 20 |
+
|
| 21 |
+
trainset = [dspy.Example(question=question).with_inputs("question") for question in trainset]
|
| 22 |
+
devset = [dspy.Example(question=question).with_inputs("question") for question in devset]
|
| 23 |
+
testset = [dspy.Example(question=question).with_inputs("question") for question in testset]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
model_name = "jinaai/jina-embeddings-v3"
|
| 27 |
+
task = "retrieval.query"
|
| 28 |
+
model_kwargs = {"device": "cpu", "trust_remote_code": True}
|
| 29 |
+
encode_kwargs = {"task": task, "prompt_name": task}
|
| 30 |
+
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 34 |
+
cluster_url="https://adrrwus9shkxkuijvazcrq.c0.us-west3.gcp.weaviate.cloud",
|
| 35 |
+
auth_credentials=Auth.api_key("J94gHySMWTWxggDDayGrF2ESGo23yOHZ1bUC"),
|
| 36 |
+
)
|
| 37 |
+
weaviate_db = WeaviateVectorStore(
|
| 38 |
+
index_name="LangChain_d73ad6159d514fec887456fa6db11e61",
|
| 39 |
+
embedding=embeddings,
|
| 40 |
+
client=weaviate_client,
|
| 41 |
+
text_key="text",
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
llm = dspy.LM(
|
| 46 |
+
"groq/llama-3.3-70b-versatile",
|
| 47 |
+
api_key="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",
|
| 48 |
+
num_retries=120,
|
| 49 |
+
)
|
| 50 |
+
dspy.configure(lm=llm)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class GenerateAnswer(dspy.Signature):
|
| 54 |
+
"""Answer questions with short factoid answers."""
|
| 55 |
+
|
| 56 |
+
context = dspy.InputField(desc="may contain relevant facts")
|
| 57 |
+
question = dspy.InputField()
|
| 58 |
+
answer = dspy.OutputField(desc="short and precise answer")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class RAG(dspy.Module):
|
| 62 |
+
def __init__(self):
|
| 63 |
+
super().__init__()
|
| 64 |
+
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
|
| 65 |
+
|
| 66 |
+
# This makes it possible to use the Langchain VectorDB integration and custom embeddings with SentenceTransformers
|
| 67 |
+
def retrieve(self, question):
|
| 68 |
+
results = weaviate_db.similarity_search(query=question)
|
| 69 |
+
passages = [res.page_content for res in results]
|
| 70 |
+
return Prediction(passages=passages)
|
| 71 |
+
|
| 72 |
+
def forward(self, question):
|
| 73 |
+
context = self.retrieve(question).passages
|
| 74 |
+
prediction = self.generate_answer(context=context, question=question)
|
| 75 |
+
return dspy.Prediction(context=context, answer=prediction.answer)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Create an LLM as a Judge Evaluation Metric for evaluation of the RAG Pipelines
|
| 79 |
+
# (Taken from weaviate recipe)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class Assess(dspy.Signature):
|
| 83 |
+
"""Assess the quality of an answer to a question."""
|
| 84 |
+
|
| 85 |
+
context = dspy.InputField(desc="The context for answering the question.")
|
| 86 |
+
assessed_question = dspy.InputField(desc="The evaluation criterion.")
|
| 87 |
+
assessed_answer = dspy.InputField(desc="The answer to the question.")
|
| 88 |
+
assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def llm_metric(gold, pred, trace=None):
|
| 92 |
+
predicted_answer = pred.answer
|
| 93 |
+
context = pred.context
|
| 94 |
+
question = gold.question
|
| 95 |
+
|
| 96 |
+
print(f"Test Question: {question}")
|
| 97 |
+
print(f"Predicted Answer: {predicted_answer}")
|
| 98 |
+
|
| 99 |
+
detail = "Is the assessed answer detailed?"
|
| 100 |
+
faithful = (
|
| 101 |
+
"Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."
|
| 102 |
+
)
|
| 103 |
+
overall = f"Please rate how well this answer answers the question, `{question}` based on the context.\n `{predicted_answer}`"
|
| 104 |
+
|
| 105 |
+
detail = dspy.ChainOfThought(Assess)(context="N/A", assessed_question=detail, assessed_answer=predicted_answer)
|
| 106 |
+
faithful = dspy.ChainOfThought(Assess)(
|
| 107 |
+
context=context, assessed_question=faithful, assessed_answer=predicted_answer
|
| 108 |
+
)
|
| 109 |
+
overall = dspy.ChainOfThought(Assess)(context=context, assessed_question=overall, assessed_answer=predicted_answer)
|
| 110 |
+
|
| 111 |
+
print(f"Faithful: {faithful.assessment_answer}")
|
| 112 |
+
print(f"Detail: {detail.assessment_answer}")
|
| 113 |
+
print(f"Overall: {overall.assessment_answer}")
|
| 114 |
+
|
| 115 |
+
total = float(detail.assessment_answer) + float(faithful.assessment_answer) * 2 + float(overall.assessment_answer)
|
| 116 |
+
|
| 117 |
+
return total / 5.0
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Evaluate our RAG Program before it is compiled
|
| 121 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 122 |
+
evaluate(RAG(), metric=llm_metric)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Optimize the RAG Program
|
| 126 |
+
optimizer = COPRO(
|
| 127 |
+
prompt_model=dspy.settings.lm,
|
| 128 |
+
metric=llm_metric,
|
| 129 |
+
breadth=3,
|
| 130 |
+
depth=2,
|
| 131 |
+
init_temperature=0.25,
|
| 132 |
+
verbose=False,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
optimized_compiled_rag = optimizer.compile(
|
| 137 |
+
RAG(),
|
| 138 |
+
trainset=trainset,
|
| 139 |
+
eval_kwargs={"num_threads": 1, "display_progress": True, "display_table": 0},
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Evaluate the optimized RAG Program
|
| 143 |
+
evaluate = Evaluate(
|
| 144 |
+
metric=llm_metric,
|
| 145 |
+
devset=devset,
|
| 146 |
+
num_threads=1,
|
| 147 |
+
display_progress=True,
|
| 148 |
+
display_table=5,
|
| 149 |
+
)
|
| 150 |
+
evaluate(optimized_compiled_rag)
|
src/rag_pipelines/pipelines/dspy_baseline_rag.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
|
| 6 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 7 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 8 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main(cluster_url, api_key, index_name, model_name, llm_model, llm_api_key):
|
| 12 |
+
"""Executes the DSPy-based Retrieval-Augmented Generation (RAG) pipeline.
|
| 13 |
+
|
| 14 |
+
This function:
|
| 15 |
+
1. Loads a dataset of earnings call Q&A pairs.
|
| 16 |
+
2. Prepares development (dev) and test datasets for evaluation.
|
| 17 |
+
3. Initializes a Weaviate vector database for storing and retrieving embeddings.
|
| 18 |
+
4. Configures a Large Language Model (LLM) with DSPy.
|
| 19 |
+
5. Instantiates and evaluates the RAG pipeline before optimization.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
cluster_url (str): URL of the Weaviate vector database cluster.
|
| 23 |
+
api_key (str): API key for authenticating access to Weaviate.
|
| 24 |
+
index_name (str): Name of the Weaviate index for storing vectors.
|
| 25 |
+
model_name (str): Embedding model name for vectorization.
|
| 26 |
+
llm_model (str): Name of the LLM used for inference.
|
| 27 |
+
llm_api_key (str): API key for accessing the LLM.
|
| 28 |
+
"""
|
| 29 |
+
# Load the earnings calls Q&A dataset (first 50 samples)
|
| 30 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 31 |
+
questions = earnings_calls_data["question"]
|
| 32 |
+
|
| 33 |
+
# Prepare dataset splits:
|
| 34 |
+
# - The first 20 questions are used for training (not explicitly utilized here).
|
| 35 |
+
# - The next 10 questions are used as the development set (devset) for evaluation.
|
| 36 |
+
# - The remaining questions are part of the test set (not used in this script).
|
| 37 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 38 |
+
|
| 39 |
+
# Initialize Weaviate VectorDB for embedding storage and retrieval
|
| 40 |
+
weaviate_db = WeaviateVectorDB(
|
| 41 |
+
cluster_url=cluster_url, # Weaviate cluster URL
|
| 42 |
+
api_key=api_key, # API key for authentication
|
| 43 |
+
index_name=index_name, # Name of the index for vector storage
|
| 44 |
+
model_name=model_name, # Embedding model used for vectorization
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Initialize the LLM with DSPy
|
| 48 |
+
llm = dspy.LM(llm_model, api_key=llm_api_key, num_retries=120)
|
| 49 |
+
dspy.configure(lm=llm) # Set DSPy’s global LLM configuration
|
| 50 |
+
|
| 51 |
+
# Instantiate the RAG pipeline
|
| 52 |
+
rag = DSPyRAG(weaviate_db)
|
| 53 |
+
|
| 54 |
+
# Initialize evaluator for measuring LLM-based retrieval performance
|
| 55 |
+
evaluator = DSPyEvaluator()
|
| 56 |
+
|
| 57 |
+
# Evaluate the RAG pipeline before optimization
|
| 58 |
+
evaluate = dspy.Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 59 |
+
evaluate(rag, metric=evaluator.llm_metric)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
|
| 63 |
+
"""
|
| 64 |
+
Parses command-line arguments and runs the DSPy-based RAG pipeline.
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
parser = argparse.ArgumentParser(description="Run DSPy-based RAG pipeline")
|
| 68 |
+
|
| 69 |
+
# Weaviate configuration parameters
|
| 70 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
|
| 71 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
|
| 72 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
|
| 73 |
+
parser.add_argument("--model_name", type=str, required=True, help="Embedding model name for vectorization.")
|
| 74 |
+
|
| 75 |
+
# LLM configuration parameters
|
| 76 |
+
parser.add_argument("--llm_model", type=str, required=True, help="LLM model name.")
|
| 77 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM access.")
|
| 78 |
+
|
| 79 |
+
# Parse command-line arguments and execute the pipeline
|
| 80 |
+
args = parser.parse_args()
|
| 81 |
+
main(args.cluster_url, args.api_key, args.index_name, args.model_name, args.llm_model, args.llm_api_key)
|
src/rag_pipelines/pipelines/dspy_bayesian_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.teleprompt import BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from weaviate.classes.init import Auth
|
| 10 |
+
|
| 11 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 12 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 13 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorStore
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def parse_args():
|
| 17 |
+
"""Parse command-line arguments."""
|
| 18 |
+
parser = argparse.ArgumentParser(description="Optimize and evaluate RAG pipeline with DSPy.")
|
| 19 |
+
|
| 20 |
+
# Dataset Arguments
|
| 21 |
+
parser.add_argument(
|
| 22 |
+
"--dataset_name", type=str, default="lamini/earnings-calls-qa", help="Name of the dataset to use."
|
| 23 |
+
)
|
| 24 |
+
parser.add_argument("--dataset_size", type=int, default=50, help="Number of examples to load from the dataset.")
|
| 25 |
+
|
| 26 |
+
# Weaviate Configuration
|
| 27 |
+
parser.add_argument("--weaviate_url", type=str, required=True, help="Weaviate cloud cluster URL.")
|
| 28 |
+
parser.add_argument("--weaviate_api_key", type=str, required=True, help="API key for Weaviate.")
|
| 29 |
+
parser.add_argument("--index_name", type=str, required=True, help="Index name in Weaviate.")
|
| 30 |
+
parser.add_argument(
|
| 31 |
+
"--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model for Weaviate."
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# LLM Configuration
|
| 35 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model to use.")
|
| 36 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM.")
|
| 37 |
+
|
| 38 |
+
# Optimization Method
|
| 39 |
+
parser.add_argument(
|
| 40 |
+
"--optimizer",
|
| 41 |
+
type=str,
|
| 42 |
+
choices=["bootstrap", "bayesian"],
|
| 43 |
+
default="bootstrap",
|
| 44 |
+
help="Choose the optimization method.",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
return parser.parse_args()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main():
|
| 51 |
+
args = parse_args()
|
| 52 |
+
|
| 53 |
+
# Load dataset
|
| 54 |
+
dataset = load_dataset(args.dataset_name, split=f"train[:{args.dataset_size}]")
|
| 55 |
+
questions = dataset["question"]
|
| 56 |
+
|
| 57 |
+
# Create DSPy datasets
|
| 58 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 59 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 60 |
+
testset = [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 61 |
+
|
| 62 |
+
# Initialize embeddings
|
| 63 |
+
model_kwargs = {"device": "cpu", "trust_remote_code": True}
|
| 64 |
+
encode_kwargs = {"task": "retrieval.query", "prompt_name": "retrieval.query"}
|
| 65 |
+
embeddings = HuggingFaceEmbeddings(
|
| 66 |
+
model_name=args.embedding_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Connect to Weaviate
|
| 70 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 71 |
+
cluster_url=args.weaviate_url,
|
| 72 |
+
auth_credentials=Auth.api_key(args.weaviate_api_key),
|
| 73 |
+
)
|
| 74 |
+
WeaviateVectorStore(
|
| 75 |
+
index_name=args.index_name,
|
| 76 |
+
embedding=embeddings,
|
| 77 |
+
client=weaviate_client,
|
| 78 |
+
text_key="text",
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
# Configure LLM
|
| 82 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=120)
|
| 83 |
+
dspy.configure(lm=llm)
|
| 84 |
+
|
| 85 |
+
# Evaluate before optimization
|
| 86 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 87 |
+
evaluate(DSPyRAG(), metric=DSPyEvaluator.llm_metric())
|
| 88 |
+
|
| 89 |
+
# Select Optimizer
|
| 90 |
+
if args.optimizer == "bootstrap":
|
| 91 |
+
optimizer = BootstrapFewShotWithRandomSearch(
|
| 92 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 93 |
+
max_bootstrapped_demos=4,
|
| 94 |
+
max_labeled_demos=4,
|
| 95 |
+
max_rounds=1,
|
| 96 |
+
num_candidate_programs=2,
|
| 97 |
+
num_threads=2,
|
| 98 |
+
)
|
| 99 |
+
else:
|
| 100 |
+
optimizer = BayesianSignatureOptimizer(
|
| 101 |
+
task_model=dspy.settings.lm,
|
| 102 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 103 |
+
prompt_model=dspy.settings.lm,
|
| 104 |
+
n=5,
|
| 105 |
+
verbose=False,
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
# Compile optimized RAG
|
| 109 |
+
optimized_compiled_rag = optimizer.compile(DSPyRAG(), testset=testset, trainset=trainset)
|
| 110 |
+
|
| 111 |
+
# Evaluate optimized RAG
|
| 112 |
+
evaluate = Evaluate(
|
| 113 |
+
metric=DSPyEvaluator.llm_metric(), devset=devset, num_threads=1, display_progress=True, display_table=5
|
| 114 |
+
)
|
| 115 |
+
evaluate(optimized_compiled_rag)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
main()
|
src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_optimization_rag.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 6 |
+
from dspy.teleprompt import BootstrapFewShot
|
| 7 |
+
|
| 8 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 9 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 10 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main(args):
|
| 14 |
+
"""Runs the DSPy RAG optimization pipeline.
|
| 15 |
+
|
| 16 |
+
This function:
|
| 17 |
+
1. Loads the earnings calls dataset.
|
| 18 |
+
2. Splits the dataset into training, development, and test sets.
|
| 19 |
+
3. Initializes a Weaviate vector database and an LLM.
|
| 20 |
+
4. Evaluates an unoptimized RAG pipeline.
|
| 21 |
+
5. Optimizes the RAG pipeline using BootstrapFewShot.
|
| 22 |
+
6. Evaluates the optimized RAG pipeline.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
args (argparse.Namespace): Command-line arguments for configuring the pipeline.
|
| 26 |
+
"""
|
| 27 |
+
# Load the dataset (Earnings Calls QA dataset)
|
| 28 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 29 |
+
questions = earnings_calls_data["question"]
|
| 30 |
+
|
| 31 |
+
# Split the dataset into training (20), development (10), and test sets
|
| 32 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 33 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 34 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]] # Test set (not used in this script)
|
| 35 |
+
|
| 36 |
+
# Initialize Weaviate VectorDB for storing and retrieving embeddings
|
| 37 |
+
weaviate_db = WeaviateVectorDB(
|
| 38 |
+
cluster_url=args.cluster_url, # URL of the Weaviate cluster
|
| 39 |
+
api_key=args.api_key, # API key for authentication
|
| 40 |
+
index_name=args.index_name, # Name of the Weaviate index
|
| 41 |
+
model_name=args.embedding_model, # Embedding model to use for vector storage
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Initialize LLM with DSPy
|
| 45 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
|
| 46 |
+
dspy.configure(lm=llm) # Set DSPy's global configuration for LLM usage
|
| 47 |
+
|
| 48 |
+
# Initialize the unoptimized RAG pipeline
|
| 49 |
+
uncompiled_rag = DSPyRAG(weaviate_db)
|
| 50 |
+
|
| 51 |
+
# Evaluate the unoptimized RAG pipeline using the development set
|
| 52 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 53 |
+
evaluate(uncompiled_rag, metric=DSPyEvaluator.llm_metric())
|
| 54 |
+
|
| 55 |
+
# Optimize the RAG pipeline using BootstrapFewShot
|
| 56 |
+
optimizer = BootstrapFewShot(metric=DSPyEvaluator.llm_metric())
|
| 57 |
+
|
| 58 |
+
# Compile an optimized version of the RAG model using the training set
|
| 59 |
+
optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
|
| 60 |
+
|
| 61 |
+
# Evaluate the optimized RAG pipeline
|
| 62 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 63 |
+
evaluate(optimized_compiled_rag)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
if __name__ == "__main__":
|
| 67 |
+
"""
|
| 68 |
+
Parses command-line arguments and runs the main function.
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
|
| 72 |
+
|
| 73 |
+
# Weaviate parameters (for vector storage and retrieval)
|
| 74 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
|
| 75 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
|
| 76 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
|
| 77 |
+
parser.add_argument(
|
| 78 |
+
"--embedding_model",
|
| 79 |
+
type=str,
|
| 80 |
+
default="jinaai/jina-embeddings-v3",
|
| 81 |
+
help="Embedding model used for document vectorization.",
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# LLM parameters (for DSPy-based language model inference)
|
| 85 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name.")
|
| 86 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for accessing the LLM service.")
|
| 87 |
+
parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM API calls.")
|
| 88 |
+
|
| 89 |
+
# Parse command-line arguments and execute the pipeline
|
| 90 |
+
args = parser.parse_args()
|
| 91 |
+
main(args)
|
src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_random_search_optimization_rag.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
import dspy
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
|
| 8 |
+
|
| 9 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 10 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 11 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main(args):
|
| 15 |
+
"""Main function to run the DSPy RAG optimization pipeline.
|
| 16 |
+
|
| 17 |
+
This function loads a dataset, initializes a Weaviate vector database and an LLM,
|
| 18 |
+
evaluates an unoptimized RAG pipeline, optimizes it using BootstrapFewShotWithRandomSearch,
|
| 19 |
+
and then evaluates the optimized pipeline.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
args (argparse.Namespace): Command-line arguments for configuring the pipeline.
|
| 23 |
+
"""
|
| 24 |
+
# Load dataset (Earnings Calls QA)
|
| 25 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 26 |
+
questions = earnings_calls_data["question"]
|
| 27 |
+
|
| 28 |
+
# Split dataset into training, development, and test sets
|
| 29 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 30 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 31 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]] # Test set (not used here)
|
| 32 |
+
|
| 33 |
+
# Initialize Weaviate Vector Database
|
| 34 |
+
weaviate_db = WeaviateVectorDB(
|
| 35 |
+
cluster_url=args.cluster_url,
|
| 36 |
+
api_key=args.api_key,
|
| 37 |
+
index_name=args.index_name,
|
| 38 |
+
model_name=args.embedding_model,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# Initialize the LLM
|
| 42 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
|
| 43 |
+
dspy.configure(lm=llm) # Set DSPy's global LLM configuration
|
| 44 |
+
|
| 45 |
+
# Initialize the unoptimized RAG pipeline
|
| 46 |
+
uncompiled_rag = DSPyRAG(weaviate_db)
|
| 47 |
+
|
| 48 |
+
# Evaluate the unoptimized RAG model
|
| 49 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 50 |
+
evaluate(uncompiled_rag, metric=DSPyEvaluator.llm_metric())
|
| 51 |
+
|
| 52 |
+
# Optimize RAG using BootstrapFewShotWithRandomSearch
|
| 53 |
+
optimizer = BootstrapFewShotWithRandomSearch(
|
| 54 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 55 |
+
max_bootstrapped_demos=args.max_bootstrapped_demos,
|
| 56 |
+
max_labeled_demos=args.max_labeled_demos,
|
| 57 |
+
max_rounds=args.max_rounds,
|
| 58 |
+
num_candidate_programs=args.num_candidate_programs,
|
| 59 |
+
num_threads=args.num_threads,
|
| 60 |
+
num_threads=args.num_threads,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Compile an optimized version of the RAG model
|
| 64 |
+
optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
|
| 65 |
+
|
| 66 |
+
# Evaluate the optimized RAG model
|
| 67 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 68 |
+
evaluate(optimized_compiled_rag)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
"""
|
| 73 |
+
Parses command-line arguments and runs the main function.
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
|
| 77 |
+
|
| 78 |
+
# Weaviate parameters
|
| 79 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
|
| 80 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
|
| 81 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
|
| 82 |
+
parser.add_argument(
|
| 83 |
+
"--embedding_model",
|
| 84 |
+
type=str,
|
| 85 |
+
default="jinaai/jina-embeddings-v3",
|
| 86 |
+
help="Embedding model to use for vector retrieval.",
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# LLM parameters
|
| 90 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name.")
|
| 91 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for accessing the LLM.")
|
| 92 |
+
parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM calls.")
|
| 93 |
+
|
| 94 |
+
# Optimization parameters
|
| 95 |
+
parser.add_argument("--max_bootstrapped_demos", type=int, default=4, help="Max bootstrapped demonstrations.")
|
| 96 |
+
parser.add_argument("--max_labeled_demos", type=int, default=4, help="Max labeled demonstrations.")
|
| 97 |
+
parser.add_argument("--max_rounds", type=int, default=1, help="Max optimization rounds.")
|
| 98 |
+
parser.add_argument("--num_candidate_programs", type=int, default=2, help="Number of candidate programs.")
|
| 99 |
+
parser.add_argument("--num_threads", type=int, default=2, help="Number of threads for optimization.")
|
| 100 |
+
|
| 101 |
+
# Parse arguments and run the main function
|
| 102 |
+
args = parser.parse_args()
|
| 103 |
+
main(args)
|
src/rag_pipelines/pipelines/dspy_copro_instruction_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.teleprompt import COPRO
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from weaviate.classes.init import Auth
|
| 10 |
+
|
| 11 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 12 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 13 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorStore
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def parse_args():
|
| 17 |
+
"""Parse command-line arguments for the DSPy RAG pipeline with Weaviate and LLM evaluation.
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
argparse.Namespace: The parsed command-line arguments.
|
| 21 |
+
"""
|
| 22 |
+
parser = argparse.ArgumentParser(description="Run DSPy RAG pipeline with Weaviate and LLM evaluation.")
|
| 23 |
+
|
| 24 |
+
# Dataset Arguments
|
| 25 |
+
parser.add_argument(
|
| 26 |
+
"--dataset_name", type=str, default="lamini/earnings-calls-qa", help="Name of the dataset to use."
|
| 27 |
+
)
|
| 28 |
+
parser.add_argument("--dataset_size", type=int, default=50, help="Number of examples to load from the dataset.")
|
| 29 |
+
|
| 30 |
+
# Weaviate Configuration
|
| 31 |
+
parser.add_argument("--weaviate_url", type=str, required=True, help="Weaviate cloud cluster URL.")
|
| 32 |
+
parser.add_argument("--weaviate_api_key", type=str, required=True, help="API key for Weaviate.")
|
| 33 |
+
parser.add_argument("--index_name", type=str, required=True, help="Index name in Weaviate.")
|
| 34 |
+
parser.add_argument(
|
| 35 |
+
"--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model for Weaviate."
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# LLM Configuration
|
| 39 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model to use.")
|
| 40 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM.")
|
| 41 |
+
|
| 42 |
+
return parser.parse_args()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def main():
|
| 46 |
+
"""Run the DSPy RAG pipeline, including dataset loading, embedding initialization, Weaviate connection, LLM evaluation, and model optimization.
|
| 47 |
+
|
| 48 |
+
This function orchestrates the entire pipeline from loading data, preparing datasets,
|
| 49 |
+
connecting to Weaviate, initializing embeddings, evaluating the model, and optimizing the RAG pipeline.
|
| 50 |
+
"""
|
| 51 |
+
# Parse command-line arguments
|
| 52 |
+
args = parse_args()
|
| 53 |
+
|
| 54 |
+
# Load dataset from Hugging Face and extract questions
|
| 55 |
+
dataset = load_dataset(args.dataset_name, split=f"train[:{args.dataset_size}]")
|
| 56 |
+
questions = dataset["question"]
|
| 57 |
+
|
| 58 |
+
# Create DSPy datasets for training and evaluation
|
| 59 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 60 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 61 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 62 |
+
|
| 63 |
+
# Initialize HuggingFace embeddings for retrieval tasks
|
| 64 |
+
model_kwargs = {"device": "cpu", "trust_remote_code": True}
|
| 65 |
+
encode_kwargs = {"task": "retrieval.query", "prompt_name": "retrieval.query"}
|
| 66 |
+
embeddings = HuggingFaceEmbeddings(
|
| 67 |
+
model_name=args.embedding_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
# Connect to Weaviate using the provided URL and API key
|
| 71 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 72 |
+
cluster_url=args.weaviate_url,
|
| 73 |
+
auth_credentials=Auth.api_key(args.weaviate_api_key),
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
# Initialize Weaviate vector store with the embeddings and client connection
|
| 77 |
+
WeaviateVectorStore(
|
| 78 |
+
index_name=args.index_name,
|
| 79 |
+
embedding=embeddings,
|
| 80 |
+
client=weaviate_client,
|
| 81 |
+
text_key="text",
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# Initialize the LLM (Language Model) with the specified model and API key
|
| 85 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=120)
|
| 86 |
+
dspy.configure(lm=llm)
|
| 87 |
+
|
| 88 |
+
# Evaluate the initial RAG pipeline
|
| 89 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 90 |
+
evaluate(DSPyRAG(), metric=DSPyEvaluator.llm_metric())
|
| 91 |
+
|
| 92 |
+
# Optimize the RAG model using COPRO (Collaborative Prompt Optimization)
|
| 93 |
+
optimizer = COPRO(
|
| 94 |
+
prompt_model=dspy.settings.lm,
|
| 95 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 96 |
+
breadth=3,
|
| 97 |
+
depth=2,
|
| 98 |
+
init_temperature=0.25,
|
| 99 |
+
verbose=False,
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Compile the optimized RAG model with the training set
|
| 103 |
+
optimized_compiled_rag = optimizer.compile(
|
| 104 |
+
DSPyRAG(),
|
| 105 |
+
trainset=trainset,
|
| 106 |
+
eval_kwargs={"num_threads": 1, "display_progress": True, "display_table": 0},
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Evaluate the optimized model on the development set
|
| 110 |
+
evaluate = Evaluate(
|
| 111 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 112 |
+
devset=devset,
|
| 113 |
+
num_threads=1,
|
| 114 |
+
display_progress=True,
|
| 115 |
+
display_table=5,
|
| 116 |
+
)
|
| 117 |
+
evaluate(optimized_compiled_rag)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
|
| 121 |
+
main()
|
src/rag_pipelines/pipelines/dspy_rag.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weave
|
| 5 |
+
from dspy import LM, Module
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DSPyRAGPipeline(weave.Model):
    """A Retrieval-Augmented Generation (RAG) pipeline wrapping a DSPy module.

    Exposes a DSPy RAG module behind a Weave ``Model`` so that every
    ``predict`` call is traced as a Weave op.

    Attributes:
        llm (LM): The language model used for generating predictions.
        rag_module (Module): The DSPy module used for retrieval + generation.
    """

    llm: LM
    rag_module: Module

    def __init__(self, llm: LM, rag_module: Module) -> None:
        """Initialize the DSPyRAG model.

        Args:
            llm (LM): The language model to be used.
            rag_module (Module): The module to be used for retrieval tasks.

        Note:
            ``dspy.configure(lm=llm)`` mutates DSPy's process-global settings;
            constructing this pipeline therefore changes the LM seen by any
            other DSPy code running in the same process.
        """
        # weave.Model is Pydantic-based: fields must go through
        # super().__init__ to be validated and registered.
        super().__init__(llm=llm, rag_module=rag_module)

        # NOTE(review): these assignments look redundant after the
        # super().__init__ call above (Pydantic already set the fields) —
        # presumably kept for explicitness; confirm before removing.
        self.llm = llm
        self.rag_module = rag_module

        dspy.configure(lm=llm)

    @weave.op()
    def predict(self, input: str) -> dict[str, Any]:
        """Predict the answer to a given question using the RAG approach.

        Args:
            input (str): The question to be answered.

        Returns:
            dict[str, Any]: A dictionary with two keys:
                - "output" (str): The predicted answer to the question.
                - "retrieval_context" (Any): The retrieval context produced by
                  the RAG module (see ``RAG.forward``, which returns a
                  ``Prediction`` carrying ``answer`` and ``retrieval_context``).
        """
        prediction = self.rag_module(input)

        return {"output": prediction.answer, "retrieval_context": prediction.retrieval_context}
|
src/rag_pipelines/pipelines/dspy_rag_module.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from dspy import ChainOfThought, Module, Prediction
|
| 4 |
+
|
| 5 |
+
from rag_pipelines.evaluation import retrieval
|
| 6 |
+
from rag_pipelines.prompts import GenerateAnswerFromContext
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RAG(Module):
    """Retrieval-Augmented Generation (RAG) module that retrieves context based on a question and generates an answer using that context."""

    def __init__(self, retriever: Any):
        """Initialize the RAG module.

        Args:
            retriever (Any): A callable retriever whose result exposes a
                ``passages`` attribute (e.g. a Milvus retriever).
        """
        super().__init__()
        self.retrieve = retriever
        self.generate_answer = ChainOfThought(GenerateAnswerFromContext)

    def forward(self, question: str) -> Prediction:
        """Process a question by retrieving context and generating an answer.

        Args:
            question (str): The question to be answered.

        Returns:
            Prediction: Carries ``retrieval_context`` (the raw text of each
            retrieved passage) and ``answer`` (the generated answer).
        """
        # Fetch supporting passages for the question.
        passages = self.retrieve(question).passages

        # Produce an answer conditioned on those passages.
        generated = self.generate_answer(context=passages, question=question)

        # Surface both the plain-text context and the answer to callers.
        passage_texts = [passage.long_text for passage in passages]
        return Prediction(retrieval_context=passage_texts, answer=generated.answer)
|
src/rag_pipelines/pipelines/rag.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
import weave
|
| 5 |
+
from langchain_community.retrievers import PineconeHybridSearchRetriever
|
| 6 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 7 |
+
from langchain_core.prompts.chat import ChatPromptTemplate
|
| 8 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 9 |
+
from langchain_groq import ChatGroq
|
| 10 |
+
from weave import Model
|
| 11 |
+
from weave.integrations.langchain import WeaveTracer
|
| 12 |
+
|
| 13 |
+
# Disable global tracing explicitly.
# NOTE(review): presumably this env var opts out of Weave's automatic
# LangChain tracing so that tracing happens only via the per-call
# WeaveTracer callback in RAGPipeline.predict — confirm against Weave docs.
os.environ["WEAVE_TRACE_LANGCHAIN"] = "false"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class RAGPipeline(Model):
    """A hybrid retrieval-augmented generation (RAG) pipeline using Weave for tracing and LangChain components.

    This pipeline integrates a retriever, prompt template, and language model (LLM) to implement a retrieval-augmented
    generation system, where the LLM generates answers based on both retrieved documents and a prompt template.
    Weave is used for tracing to monitor the pipeline's execution.

    Attributes:
        retriever: The retrieval model used to fetch relevant documents based on a query.
        prompt: The prompt template to generate questions for the LLM.
        llm: The language model used to generate responses.
        tracer: The tracer used to record the execution details with Weave.
        tracing_project_name: The name of the Weave project for tracing.
        weave_params: Extra keyword arguments forwarded to ``weave.init``.
    """

    retriever: Optional[PineconeHybridSearchRetriever] = None
    prompt: Optional[ChatPromptTemplate] = None
    llm: Optional[ChatGroq] = None
    tracing_project_name: str
    weave_params: dict[str, Any]
    tracer: Optional[WeaveTracer] = None

    def __init__(self, retriever, prompt, llm, tracing_project_name="hybrid_rag", weave_params=None):
        """Initialize the RAGPipeline.

        Sets up the retriever, prompt, and LLM, then initializes Weave tracing.

        Args:
            retriever: The retrieval model used to fetch documents for the RAG pipeline.
            prompt: The prompt template used to create questions for the LLM.
            llm: The language model used for response generation based on retrieved documents and prompt.
            tracing_project_name (str): The name of the Weave project for tracing. Defaults to "hybrid_rag".
            weave_params (dict): Additional parameters for initializing Weave. This can include configuration
                details or authentication settings for the Weave service. Defaults to an empty dict.
        """
        # BUG FIX: normalize BEFORE super().__init__(). The `weave_params`
        # field is declared as `dict[str, Any]` (not Optional), so passing
        # None through Pydantic validation would raise; the original code
        # normalized only after the super() call, which was too late.
        if weave_params is None:
            weave_params = {}

        super().__init__(
            retriever=retriever,
            prompt=prompt,
            llm=llm,
            tracing_project_name=tracing_project_name,
            weave_params=weave_params,
        )

        self.retriever = retriever
        self.prompt = prompt
        self.llm = llm
        self.tracing_project_name = tracing_project_name

        # weave_params is now always a dict, so one call covers both the
        # "params provided" and "defaults" cases — the original
        # `if weave_params: ... else: ...` was redundant (`f(**{})` == `f()`).
        self._initialize_weave(**weave_params)

    def _initialize_weave(self, **weave_params):
        """Initialize Weave with the specified tracing project name.

        Sets up the Weave project and creates the WeaveTracer instance that
        records the execution of each step in the RAG pipeline for monitoring
        and debugging purposes.
        """
        # Initialize the Weave project, forwarding any extra configuration.
        weave.init(self.tracing_project_name, **weave_params)
        # Set up the tracer for tracking pipeline execution.
        self.tracer = WeaveTracer()

    @weave.op()
    def predict(self, question: str) -> str:
        """Execute the hybrid RAG pipeline with the given question.

        Retrieves documents with the retriever, formats them into a context
        string, fills the prompt template, and runs the LLM. Each invocation
        is traced with the Weave callback tracer.

        Args:
            question (str): The input question to be answered by the pipeline.

        Returns:
            str: The answer generated by the LLM based on the retrieved documents and the question prompt.
        """
        # Attach the Weave tracer so this invocation is recorded.
        config = {"callbacks": [self.tracer]}

        # LCEL chain: retrieve -> format docs into "context", pass the raw
        # question through, fill the prompt, call the LLM, parse to str.
        rag_chain = (
            {
                "context": self.retriever | self.format_docs,
                "question": RunnablePassthrough(),
            }
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

        return rag_chain.invoke(question, config=config)

    def format_docs(self, docs):
        """Format retrieved documents into a string for input to the LLM.

        Each document is rendered with its filing date, accession number,
        excerpt, optional image descriptions, and summary; the result is the
        "context" fed to the prompt template.

        Args:
            docs (list): Retrieved document objects; each must carry
                ``page_content`` and metadata keys ``filing_date``,
                ``accession_no``, ``summary``, and ``image_descriptions``.

        Returns:
            str: A formatted string of document contents, joined by newline characters.
        """
        context = ""
        for doc in docs:
            date = doc.metadata["filing_date"]
            accession_no = doc.metadata["accession_no"]
            summary = doc.metadata["summary"]
            image_descriptions = doc.metadata["image_descriptions"]
            context += (
                f"# Report {accession_no} filed on {date}:\n\n## An excerpt from the report"
                f"\n\n{doc.page_content}\n\n"
            )
            # Only include the image section when descriptions exist.
            if image_descriptions:
                context += f"## Image descriptions\n\n{image_descriptions}\n\n"
            context += (
                "## Summary of the report\n\nHere's a summary of the report along with the some "
                f"important keywords and phrases present in the report:\n\n{summary}\n\n"
            )

        return context
|