awinml commited on
Commit
336f4a9
·
verified ·
1 Parent(s): a29687e

Upload 107 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +118 -0
  2. pyproject.toml +202 -0
  3. src/rag_pipelines/__init__.py +0 -0
  4. src/rag_pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
  5. src/rag_pipelines/embeddings/__init__.py +6 -0
  6. src/rag_pipelines/embeddings/__pycache__/__init__.cpython-310.pyc +0 -0
  7. src/rag_pipelines/embeddings/__pycache__/dense.cpython-310.pyc +0 -0
  8. src/rag_pipelines/embeddings/__pycache__/sparse_fastembed_qdrant.cpython-310.pyc +0 -0
  9. src/rag_pipelines/embeddings/__pycache__/sparse_milvus.cpython-310.pyc +0 -0
  10. src/rag_pipelines/embeddings/__pycache__/sparse_pinecone_text.cpython-310.pyc +0 -0
  11. src/rag_pipelines/embeddings/dense.py +85 -0
  12. src/rag_pipelines/embeddings/sparse_fastembed_qdrant.py +57 -0
  13. src/rag_pipelines/embeddings/sparse_milvus.py +67 -0
  14. src/rag_pipelines/embeddings/sparse_pinecone_text.py +58 -0
  15. src/rag_pipelines/evaluation/__init__.py +19 -0
  16. src/rag_pipelines/evaluation/evaluator.py +54 -0
  17. src/rag_pipelines/evaluation/response/__init__.py +0 -0
  18. src/rag_pipelines/evaluation/response/answer_relevancy.py +152 -0
  19. src/rag_pipelines/evaluation/response/faithfulness.py +132 -0
  20. src/rag_pipelines/evaluation/response/hallucination.py +127 -0
  21. src/rag_pipelines/evaluation/response/phoenix_hallucination.py +107 -0
  22. src/rag_pipelines/evaluation/response/summarization.py +158 -0
  23. src/rag_pipelines/evaluation/retrieval/__init__.py +0 -0
  24. src/rag_pipelines/evaluation/retrieval/contextual_precision.py +160 -0
  25. src/rag_pipelines/evaluation/retrieval/contextual_recall.py +127 -0
  26. src/rag_pipelines/evaluation/retrieval/contextual_relevancy.py +125 -0
  27. src/rag_pipelines/llms/__init__.py +3 -0
  28. src/rag_pipelines/llms/__pycache__/__init__.cpython-310.pyc +0 -0
  29. src/rag_pipelines/llms/__pycache__/groq.cpython-310.pyc +0 -0
  30. src/rag_pipelines/llms/groq.py +99 -0
  31. src/rag_pipelines/pipelines/__init__.py +3 -0
  32. src/rag_pipelines/pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
  33. src/rag_pipelines/pipelines/__pycache__/self_rag.cpython-310.pyc +0 -0
  34. src/rag_pipelines/pipelines/__pycache__/self_rag_graph_state.cpython-310.pyc +0 -0
  35. src/rag_pipelines/pipelines/adaptive_rag.py +0 -0
  36. src/rag_pipelines/pipelines/adaptive_rag_graph_state.py +18 -0
  37. src/rag_pipelines/pipelines/crag.py +172 -0
  38. src/rag_pipelines/pipelines/crag_graph_state.py +17 -0
  39. src/rag_pipelines/pipelines/dspy/dspy_baseline_rag.py +46 -0
  40. src/rag_pipelines/pipelines/dspy/dspy_bayesian_signature_optimization_rag.py +124 -0
  41. src/rag_pipelines/pipelines/dspy/dspy_bootstrap_few_shot_optimization_rag.py +60 -0
  42. src/rag_pipelines/pipelines/dspy/dspy_copro_instruction_signature_optimization_rag.py +150 -0
  43. src/rag_pipelines/pipelines/dspy_baseline_rag.py +81 -0
  44. src/rag_pipelines/pipelines/dspy_bayesian_signature_optimization_rag.py +119 -0
  45. src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_optimization_rag.py +91 -0
  46. src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_random_search_optimization_rag.py +103 -0
  47. src/rag_pipelines/pipelines/dspy_copro_instruction_signature_optimization_rag.py +121 -0
  48. src/rag_pipelines/pipelines/dspy_rag.py +47 -0
  49. src/rag_pipelines/pipelines/dspy_rag_module.py +39 -0
  50. src/rag_pipelines/pipelines/rag.py +146 -0
.gitignore ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local run files
2
+ qa.db
3
+ **/qa.db
4
+ **/*qa*.db
5
+ **/test-reports
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ /pycache/*
10
+ **/pycache/*
11
+ */*/pycache/*
12
+ */*/*/pycache/*
13
+ */*/*/*/pycache/*
14
+ *.py[cod]
15
+ *$py.class
16
+
17
+ # C extensions
18
+ *.so
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ parts/
31
+ sdist/
32
+ var/
33
+ wheels/
34
+ pip-wheel-metadata/
35
+ share/python-wheels/
36
+ *.egg-info/
37
+ .installed.cfg
38
+ *.egg
39
+ MANIFEST
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # PyBuilder
73
+ target/
74
+
75
+ # Jupyter Notebook
76
+ .ipynb_checkpoints
77
+
78
+ # IPython
79
+ profile_default/
80
+ ipython_config.py
81
+
82
+ # pyenv
83
+ .python-version
84
+
85
+ # pyflow
86
+ __pypackages__/
87
+
88
+ # Environments
89
+ .env
90
+ .venv
91
+ env/
92
+ venv/
93
+ ENV/
94
+ env.bak/
95
+ venv.bak/
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # Pyre type checker
106
+ .pyre/
107
+
108
+ # PyCharm
109
+ .idea
110
+
111
+ # VSCode
112
+ .vscode
113
+
114
+ # http cache (requests-cache)
115
+ **/http_cache.sqlite
116
+
117
+ # ruff
118
+ .ruff_cache
pyproject.toml ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "rag-pipelines"
7
+ version = "0.0.1"
8
+ description = 'Advanced Retrieval Augmented Generation Pipelines'
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ keywords = []
13
+ authors = [
14
+ { name = "Ashwin Mathur", email = "" },
15
+ { name = "Varun Mathur", email = "" },
16
+ ]
17
+ classifiers = [
18
+ "License :: OSI Approved :: MIT License",
19
+ "Development Status :: 4 - Beta",
20
+ "Programming Language :: Python",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: Implementation :: CPython",
25
+ "Programming Language :: Python :: Implementation :: PyPy",
26
+ ]
27
+ dependencies = [
28
+ "dataloaders @ git+https://github.com/avnlp/dataloaders.git",
29
+ "langchain-core",
30
+ "langgraph",
31
+ "langchain-text-splitters",
32
+ "langchain-experimental",
33
+ "langchain-huggingface",
34
+ "langchain-groq",
35
+ "langchain_milvus",
36
+ "langchain-qdrant",
37
+ "langchain-pinecone",
38
+ "langchain-voyageai",
39
+ "spladerunner",
40
+ "haystack-ai",
41
+ "weave",
42
+ "edgartools",
43
+ "fastembed",
44
+ "pinecone-text[splade]",
45
+ "unstructured[pdf]",
46
+ "deepeval",
47
+ "arize-phoenix",
48
+ "dspy",
49
+ "dspy-ai[milvus]",
50
+ "optimum[onnxruntime]",
51
+ ]
52
+
53
+ [project.optional-dependencies]
54
+ dev = ["pytest"]
55
+
56
+ [project.urls]
57
+ Documentation = "https://github.com/avnlp/rag-pipelines#readme"
58
+ Issues = "https://github.com/avnlp/rag-pipelines/issues"
59
+ Source = "https://github.com/avnlp/rag-pipelines"
60
+
61
+ [tool.hatch.metadata]
62
+ allow-direct-references = true
63
+
64
+ [tool.hatch.build.targets.wheel]
65
+ packages = ["src/rag_pipelines"]
66
+
67
+ [tool.hatch.envs.default]
68
+ installer = "uv"
69
+ dependencies = [
70
+ "coverage[toml]>=6.5",
71
+ "pytest",
72
+ "pytest-rerunfailures",
73
+ "pytest-mock",
74
+ ]
75
+
76
+ [tool.hatch.envs.default.scripts]
77
+ test = "pytest -vv {args:tests}"
78
+ test-cov = "coverage run -m pytest {args:tests}"
79
+ test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x"
80
+ cov-report = ["- coverage combine", "coverage report"]
81
+ cov = ["test-cov", "cov-report"]
82
+ cov-retry = ["test-cov-retry", "cov-report"]
83
+
84
+ [[tool.hatch.envs.test.matrix]]
85
+ python = ["39", "310", "311"]
86
+
87
+ [tool.hatch.envs.lint]
88
+ installer = "uv"
89
+ detached = true
90
+ dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
91
+
92
+ [tool.hatch.envs.lint.scripts]
93
+ typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
94
+ style = ["ruff check {args:}", "black --check --diff {args:.}"]
95
+ fmt = ["black {args:.}", "ruff check --fix --unsafe-fixes {args:}", "style"]
96
+ all = ["style", "typing"]
97
+
98
+ [tool.coverage.run]
99
+ source = ["rag_pipelines"]
100
+ branch = true
101
+ parallel = true
102
+
103
+ [tool.coverage.report]
104
+ omit = ["*/tests/*", "*/__init__.py"]
105
+ show_missing = true
106
+ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
107
+
108
+ [tool.ruff]
109
+ target-version = "py39"
110
+ line-length = 120
111
+
112
+ [tool.ruff.lint]
113
+ select = [
114
+ "A",
115
+ "ARG",
116
+ "B",
117
+ "C",
118
+ "D",
119
+ "D401",
120
+ "DTZ",
121
+ "E",
122
+ "EM",
123
+ "F",
124
+ "I",
125
+ "ICN",
126
+ "ISC",
127
+ "N",
128
+ "PLC",
129
+ "PLE",
130
+ "PLR",
131
+ "PLW",
132
+ "Q",
133
+ "RUF",
134
+ "S",
135
+ "T",
136
+ "TID",
137
+ "UP",
138
+ "W",
139
+ "YTT",
140
+ ]
141
+ ignore = [
142
+ # Allow non-abstract empty methods in abstract base classes
143
+ "B027",
144
+ # Allow boolean positional values in function calls, like `dict.get(... True)`
145
+ "FBT003",
146
+ # Ignore checks for possible passwords
147
+ "S102",
148
+ "S105",
149
+ "S106",
150
+ "S107",
151
+ # Ignore complexity
152
+ "C901",
153
+ "PLR0911",
154
+ "PLR0912",
155
+ "PLR0913",
156
+ "PLR0915",
157
+ # Allow print statements
158
+ "T201",
159
+ # Ignore missing module docstrings
160
+ "D100",
161
+ "D104",
162
+ # Ignore Line too long
163
+ "E501",
164
+ # Ignore builtin argument shadowing
165
+ "A002",
166
+ # Ignore builtin module shadowing
167
+ "A005",
168
+ # Ignore Function calls in argument defaults
169
+ "B008",
170
+ "ARG002",
171
+ "ARG005",
172
+ ]
173
+ unfixable = [
174
+ # Don't touch unused imports
175
+ "F401",
176
+ ]
177
+
178
+ [tool.ruff.lint.pydocstyle]
179
+ convention = "google"
180
+
181
+ [tool.ruff.lint.isort]
182
+ known-first-party = ["rag_pipelines"]
183
+
184
+ [tool.ruff.lint.flake8-tidy-imports]
185
+ ban-relative-imports = "parents"
186
+
187
+ [tool.ruff.lint.per-file-ignores]
188
+ # Tests can use magic values, assertions, and relative imports
189
+ "tests/**/*" = ["PLR2004", "S101", "TID252"]
190
+
191
+ [tool.pytest.ini_options]
192
+ minversion = "6.0"
193
+ addopts = "--strict-markers"
194
+ markers = ["integration: integration tests"]
195
+ log_cli = true
196
+
197
+ [tool.black]
198
+ line-length = 120
199
+
200
+ [[tool.mypy.overrides]]
201
+ module = ["rag_pipelines.*", "pytest.*", "numpy.*"]
202
+ ignore_missing_imports = true
src/rag_pipelines/__init__.py ADDED
File without changes
src/rag_pipelines/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (164 Bytes). View file
 
src/rag_pipelines/embeddings/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from rag_pipelines.embeddings.dense import DenseEmbeddings
2
+ from rag_pipelines.embeddings.sparse_fastembed_qdrant import SparseEmbeddings
3
+ from rag_pipelines.embeddings.sparse_milvus import SparseEmbeddingsMilvus
4
+ from rag_pipelines.embeddings.sparse_pinecone_text import SparseEmbeddingsSplade
5
+
6
+ __all__ = ["DenseEmbeddings", "SparseEmbeddings", "SparseEmbeddingsMilvus", "SparseEmbeddingsSplade"]
src/rag_pipelines/embeddings/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (552 Bytes). View file
 
src/rag_pipelines/embeddings/__pycache__/dense.cpython-310.pyc ADDED
Binary file (3.36 kB). View file
 
src/rag_pipelines/embeddings/__pycache__/sparse_fastembed_qdrant.cpython-310.pyc ADDED
Binary file (2.73 kB). View file
 
src/rag_pipelines/embeddings/__pycache__/sparse_milvus.cpython-310.pyc ADDED
Binary file (3.37 kB). View file
 
src/rag_pipelines/embeddings/__pycache__/sparse_pinecone_text.cpython-310.pyc ADDED
Binary file (2.61 kB). View file
 
src/rag_pipelines/embeddings/dense.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional
2
+
3
+ import weave
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+
6
+
7
class DenseEmbeddings(weave.Model):
    """Generate dense embeddings for documents and queries using a SentenceTransformer model.

    Wraps LangChain's `HuggingFaceEmbeddings` to compute dense vectors for input text.

    Attributes:
        model_name (str): Name of the pre-trained embedding model.
        model_kwargs (Optional[dict[str, Any]]): Extra configuration passed to the model constructor.
        encode_kwargs (Optional[dict[str, Any]]): Parameters controlling the encoding process.
        show_progress (bool): Whether to display a progress bar during encoding.
        embedding_model (Optional[HuggingFaceEmbeddings]): The initialized embeddings backend.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]]
    encode_kwargs: Optional[dict[str, Any]]
    show_progress: bool
    embedding_model: Optional[HuggingFaceEmbeddings] = None

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs: Optional[dict[str, Any]] = None,
        encode_kwargs: Optional[dict[str, Any]] = None,
        show_progress: bool = True,
    ):
        """Initialize the DenseEmbeddings class with the specified model and configurations.

        Args:
            model_name (str): Name of the pre-trained embedding model.
                Defaults to "sentence-transformers/all-MiniLM-L6-v2".
            model_kwargs (Optional[dict[str, Any]]): Extra model configuration parameters.
                Defaults to {"device": "cpu"} when None.
            encode_kwargs (Optional[dict[str, Any]]): Encoding settings.
                Defaults to {"normalize_embeddings": True} when None.
            show_progress (bool): Whether to display progress during encoding. Defaults to True.
        """
        # Resolve mutable defaults before handing the values to pydantic.
        if encode_kwargs is None:
            encode_kwargs = {"normalize_embeddings": True}
        if model_kwargs is None:
            model_kwargs = {"device": "cpu"}
        # weave.Model is a pydantic model: super().__init__ stores all declared fields,
        # so no manual re-assignment of model_name/model_kwargs/encode_kwargs is needed
        # (the original's `x if x is not None else {}` branches were unreachable here).
        super().__init__(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
            show_progress=show_progress,
        )

        # Initialize the embedding backend with the resolved parameters.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.model_name,
            model_kwargs=self.model_kwargs,
            encode_kwargs=self.encode_kwargs,
            show_progress=show_progress,
        )

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts and return their embeddings.

        Args:
            texts (list[str]): Texts to embed.

        Returns:
            list[list[float]]: One embedding vector per input text.
        """
        return self.embedding_model.embed_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query text and return its embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            list[float]: The embedding vector for the query text.
        """
        return self.embedding_model.embed_query(text)
src/rag_pipelines/embeddings/sparse_fastembed_qdrant.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional
2
+
3
+ import weave
4
+ from langchain_qdrant.fastembed_sparse import FastEmbedSparse
5
+
6
+
7
class SparseEmbeddings(weave.Model):
    """Generate sparse embeddings for documents and queries using the FastEmbedSparse model.

    Attributes:
        model_name (str): The name of the sparse embedding model to use.
        model_kwargs (Optional[dict[str, Any]]): Additional configuration parameters for the model.
        sparse_embedding_model (Optional[FastEmbedSparse]): The initialized FastEmbedSparse model.
    """

    # weave.Model is a pydantic model; fields must be declared at class level and
    # initialized through super().__init__ — plain `self.x = ...` on undeclared
    # attributes raises at runtime. (This mirrors DenseEmbeddings in this package.)
    model_name: str
    model_kwargs: Optional[dict[str, Any]] = None
    sparse_embedding_model: Optional[FastEmbedSparse] = None

    def __init__(
        self,
        model_name: str = "prithvida/Splade_PP_en_v1",
        model_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize the SparseEmbeddings class with the specified model and configurations.

        Args:
            model_name (str): The name of the sparse embedding model.
                Defaults to "prithvida/Splade_PP_en_v1".
            model_kwargs (Optional[dict[str, Any]]): Additional model configuration parameters.
                Defaults to None (treated as an empty dict).
        """
        if model_kwargs is None:
            model_kwargs = {}
        super().__init__(model_name=model_name, model_kwargs=model_kwargs)

        # Initialize the sparse embedding model with the specified parameters.
        self.sparse_embedding_model = FastEmbedSparse(model_name=self.model_name, **self.model_kwargs)

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[dict[str, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[str, float]]: One sparse embedding dictionary per document text,
                mapping terms to their corresponding weights.
        """
        return self.sparse_embedding_model.embed_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> dict[str, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[str, float]: A sparse embedding dictionary for the query text, where keys
                are terms and values are term weights.
        """
        return self.sparse_embedding_model.embed_query(text)
src/rag_pipelines/embeddings/sparse_milvus.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional
2
+
3
+ import weave
4
+ from langchain_milvus.utils.sparse import BaseSparseEmbedding
5
+ from spladerunner import Expander
6
+
7
+
8
class SparseEmbeddingsMilvus(BaseSparseEmbedding):
    """Generate sparse embeddings for documents and queries using a spladerunner Expander model.

    Implements LangChain Milvus's `BaseSparseEmbedding` interface on top of the
    spladerunner SPLADE `Expander`.

    Attributes:
        model_name (str): The name of the sparse embedding model to use.
        max_length (int): Maximum token length passed to the Expander.
        sparse_embedding_model (Optional[Any]): The initialized Expander instance.
    """

    model_name: str
    max_length: int = 512
    sparse_embedding_model: Optional[Any] = None

    def __init__(
        self,
        model_name: str = "Splade_PP_en_v1",
        max_length: int = 512,
    ):
        """Initialize the SparseEmbeddingsMilvus class with the specified model and configurations.

        Args:
            model_name (str): The name of the sparse embedding model. Defaults to "Splade_PP_en_v1".
            max_length (int): Maximum token length for the Expander. Defaults to 512.
        """
        self.model_name = model_name
        self.max_length = max_length

        # Initialize the sparse embedding model with the specified parameters.
        self.sparse_embedding_model = Expander(model_name=self.model_name, max_length=self.max_length)

    def _sparse_to_dict(self, sparse_vector: Any) -> dict[int, float]:
        """Convert an {"indices": [...], "values": [...]} sparse vector into an index->weight dict."""
        return dict(zip(sparse_vector["indices"], sparse_vector["values"]))

    @weave.op()
    def embed_query(self, text: str) -> dict[int, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[int, float]: A sparse embedding dictionary for the query text, where keys
                are term indices and values are term weights.
        """
        sparse_embeddings = list(self.sparse_embedding_model.expand([text]))
        return self._sparse_to_dict(sparse_embeddings[0])

    @weave.op()
    def embed_documents(self, texts: list[str]) -> list[dict[int, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[int, float]]: One sparse embedding dictionary per document text.
                Each dictionary maps term indices to their corresponding weights.
        """
        sparse_embeddings = list(self.sparse_embedding_model.expand(texts))
        return [self._sparse_to_dict(sparse_embeddings[i]) for i in range(len(texts))]
src/rag_pipelines/embeddings/sparse_pinecone_text.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional
2
+
3
+ import weave
4
+ from pinecone_text.sparse import SpladeEncoder
5
+
6
+
7
class SparseEmbeddingsSplade(weave.Model):
    """Generate sparse embeddings for documents and queries using pinecone-text's SpladeEncoder.

    Attributes:
        model_kwargs (Optional[dict[str, Any]]): Additional configuration parameters for the model.
        sparse_embedding_model (Optional[SpladeEncoder]): The initialized SpladeEncoder instance.
    """

    model_kwargs: Optional[dict[str, Any]]
    sparse_embedding_model: Optional[SpladeEncoder] = None

    def __init__(
        self,
        model_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize the SparseEmbeddingsSplade class with the specified configurations.

        Args:
            model_kwargs (Optional[dict[str, Any]]): Additional model configuration parameters.
                Defaults to None (treated as an empty dict).
        """
        super().__init__(model_kwargs=model_kwargs)

        self.model_kwargs = model_kwargs if model_kwargs is not None else {}

        # Initialize the sparse embedding model with the specified parameters.
        self.sparse_embedding_model = SpladeEncoder(**self.model_kwargs)

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[dict[str, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[str, float]]: One sparse embedding dictionary per document text,
                mapping terms to their corresponding weights.
        """
        return self.sparse_embedding_model.encode_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> dict[str, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[str, float]: A sparse embedding dictionary for the query text, where keys
                are terms and values are term weights.
        """
        # encode_queries on a list returns a list of sparse vectors; unwrap the single
        # result so the return matches the annotation and the sibling embedder classes.
        return self.sparse_embedding_model.encode_queries([text])[0]
src/rag_pipelines/evaluation/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rag_pipelines.evaluation.evaluator import Evaluator
2
+ from rag_pipelines.evaluation.response.answer_relevancy import AnswerRelevancyScorer
3
+ from rag_pipelines.evaluation.response.faithfulness import FaithfulnessScorer
4
+ from rag_pipelines.evaluation.response.hallucination import HallucinationScorer
5
+ from rag_pipelines.evaluation.response.summarization import SummarizationScorer
6
+ from rag_pipelines.evaluation.retrieval.contextual_precision import ContextualPrecisionScorer
7
+ from rag_pipelines.evaluation.retrieval.contextual_recall import ContextualRecallScorer
8
+ from rag_pipelines.evaluation.retrieval.contextual_relevancy import ContextualRelevancyScorer
9
+
10
+ __all__ = [
11
+ "AnswerRelevancyScorer",
12
+ "ContextualPrecisionScorer",
13
+ "ContextualRecallScorer",
14
+ "ContextualRelevancyScorer",
15
+ "Evaluator",
16
+ "FaithfulnessScorer",
17
+ "HallucinationScorer",
18
+ "SummarizationScorer",
19
+ ]
src/rag_pipelines/evaluation/evaluator.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ from weave import Dataset, Evaluation, Model, Scorer
4
+
5
+
6
class Evaluator:
    """Evaluate a model on a dataset using a list of scorers.

    Attributes:
        evaluation_name (str): The name of the evaluation run.
        evaluation_dataset (Dataset): The dataset used for evaluation.
        evaluation_scorers (list[Scorer]): Scorer objects used to evaluate the pipeline.
        pipeline (Model): The pipeline (model) to be evaluated.
    """

    def __init__(
        self,
        evaluation_name: str,
        evaluation_dataset: Dataset,
        evaluation_scorers: list[Scorer],
        pipeline: Model,
    ):
        """Initialize the Evaluator instance with the specified evaluation parameters.

        Args:
            evaluation_name (str): A unique identifier for the evaluation run.
            evaluation_dataset (Dataset): A `Dataset` object representing the data for evaluation.
            evaluation_scorers (list[Scorer]): A list of `Scorer` objects that calculate various metrics.
            pipeline (Model): The model or pipeline to evaluate.
        """
        self.evaluation_name = evaluation_name
        self.evaluation_dataset = evaluation_dataset
        self.evaluation_scorers = evaluation_scorers
        self.pipeline = pipeline

    def evaluate(self) -> dict:
        """Perform evaluation of the pipeline using the specified dataset and scorers.

        Creates an `Evaluation` object and executes the evaluation run to completion.

        Returns:
            dict: The evaluation results produced by `weave.Evaluation.evaluate`.

        Raises:
            RuntimeError: If the underlying evaluation run fails for any reason.
        """
        evaluation = Evaluation(
            evaluation_name=self.evaluation_name,
            dataset=self.evaluation_dataset,
            scorers=self.evaluation_scorers,
        )

        # weave's Evaluation.evaluate is a coroutine; run it to completion here.
        try:
            evaluation_results = asyncio.run(evaluation.evaluate(self.pipeline))
        except Exception as exception:
            msg = f"Evaluation run failed: {exception}"
            raise RuntimeError(msg) from exception

        return evaluation_results
src/rag_pipelines/evaluation/response/__init__.py ADDED
File without changes
src/rag_pipelines/evaluation/response/answer_relevancy.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from statistics import variance
2
+ from typing import Optional, Union
3
+
4
+ import numpy as np
5
+ import weave
6
+ from deepeval.metrics import AnswerRelevancyMetric
7
+ from deepeval.test_case import LLMTestCase
8
+ from weave import Scorer
9
+
10
+
11
+ class AnswerRelevancyScorer(Scorer):
12
+ """Evaluate the relevancy of answers generated by a LLM.
13
+
14
+ This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
15
+ compared to the input query.
16
+
17
+ The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
18
+ actual output of an LLM application is in relation to the input query.
19
+
20
+ Attributes:
21
+ threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
22
+ model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
23
+ include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
24
+ strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
25
+ 1. Defaults to False.
26
+ async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
27
+ verbose (bool): Whether to print intermediate steps to the console, defaults to False.
28
+ metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
29
+ """
30
+
31
+ threshold: float = Optional[None]
32
+ model: str = Optional[None]
33
+ include_reason: bool = Optional[None]
34
+ strict_mode: bool = Optional[None]
35
+ async_mode: bool = Optional[None]
36
+ verbose: bool = Optional[None]
37
+ metric: AnswerRelevancyMetric = Optional[None]
38
+
39
+ def __init__(
40
+ self,
41
+ threshold: float = 0.5,
42
+ model: str = "gpt-4",
43
+ include_reason: bool = True,
44
+ strict_mode: bool = False,
45
+ async_mode: bool = True,
46
+ verbose: bool = False,
47
+ ):
48
+ """Initialize the AnswerRelevancy Scorer with the specified parameters.
49
+
50
+ Args:
51
+ threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
52
+ model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
53
+ include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
54
+ strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to 1. Defaults to False.
55
+ async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
56
+ verbose (bool): Whether to print intermediate steps to the console, defaults to False.
57
+ """
58
+ super().__init__(
59
+ threshold=threshold,
60
+ model=model,
61
+ include_reason=include_reason,
62
+ strict_mode=strict_mode,
63
+ async_mode=async_mode,
64
+ verbose=verbose,
65
+ )
66
+
67
+ self.threshold = threshold
68
+ self.model = model
69
+ self.include_reason = include_reason
70
+ self.strict_mode = strict_mode
71
+ self.async_mode = async_mode
72
+ self.verbose = verbose
73
+
74
+ self.metric = AnswerRelevancyMetric(
75
+ threshold=self.threshold,
76
+ model=self.model,
77
+ include_reason=self.include_reason,
78
+ async_mode=self.async_mode,
79
+ strict_mode=self.strict_mode,
80
+ verbose_mode=self.verbose,
81
+ )
82
+
83
+ @weave.op
84
+ def score(
85
+ self,
86
+ input: str,
87
+ output: Optional[dict] = None,
88
+ expected_output: Optional[str] = None,
89
+ context: Optional[list[str]] = None,
90
+ ) -> dict[str, Union[str, float]]:
91
+ """Evaluate the relevancy and accuracy of answers generated by a LLM.
92
+
93
+ The AnswerRelevancy score is calculated according to the following equation:
94
+
95
+ Answer Relevancy = Total Number of Statements / Number of Relevant Statements
96
+
97
+ The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using the same LLM to classify whether each statement is relevant to the input.
98
+
99
+
100
+ Args:
101
+ input (str): The input query or prompt that triggered the output.
102
+ output (dict): The LLM generated response to evaluate and the retrieval context.
103
+ expected_output (Optional[str]): The expected or reference output, defaults to None.
104
+ context (Optional[list[str]]): Additional context for the evaluation, defaults to None.
105
+
106
+ Returns:
107
+ dict[str, Union[str, float]]: A dictionary containing:
108
+ - "score" (float): The computed answer relevancy score.
109
+ """
110
+ test_case = LLMTestCase(
111
+ input=input,
112
+ actual_output=output.get("output", ""),
113
+ expected_output=expected_output,
114
+ retrieval_context=output.get("retrieval_context", [""]),
115
+ context=context,
116
+ )
117
+
118
+ result: dict[str, Union[str, float]] = {}
119
+
120
+ self.metric.measure(test_case)
121
+ result = {"score": self.metric.score}
122
+
123
+ return result
124
+
125
+ @weave.op()
126
+ def summarize(self, score_rows: list) -> dict:
127
+ """Summarize the results of the AnswerRelevancy Scorer.
128
+
129
+ Args:
130
+ score_rows (list): A list of dictionaries containing the following keys:
131
+ - "score" (float): The computed answer relevancy score.
132
+ - "reason" (str): A detailed explanation for the assigned score.
133
+
134
+ Returns:
135
+ dict: A dictionary containing the following keys:
136
+ - "answer_relevancy_score" (dict): A dictionary containing the following keys:
137
+ - "score" (float): The average answer relevancy score.
138
+ - "variance" (float): The variance of the answer relevancy scores.
139
+ - "std" (float): The standard deviation of the answer relevancy scores.
140
+ - "count" (int): The number of answer relevancy scores.
141
+ """
142
+ scores = []
143
+ for row in score_rows:
144
+ score = row.get("score", 0.0)
145
+ scores.append(float(score))
146
+
147
+ score = np.mean(scores).item()
148
+ variance = np.var(scores).item()
149
+ std = np.std(scores).item()
150
+ count = len(scores)
151
+
152
+ return {"answer_relevancy_score": {"score": score, "variance": variance, "std": std, "count": count}}
src/rag_pipelines/evaluation/response/faithfulness.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import weave
4
+ from deepeval.metrics import FaithfulnessMetric
5
+ from deepeval.test_case import LLMTestCase
6
+ from weave import Scorer
7
+
8
+
9
class FaithfulnessScorer(Scorer):
    """Evaluate the faithfulness of LLM generated outputs.

    This scorer wraps DeepEval's `FaithfulnessMetric`, which measures the quality of
    an LLM generation by evaluating whether the `actual_output` factually aligns with
    the contents of the `retrieval_context`.

    Attributes:
        threshold (float): The minimum score required to pass the faithfulness check, defaults to 0.5.
        model (str): The LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the assigned score, defaults to True.
        strict_mode (bool): When True, enforces binary scoring (1 for perfect alignment, 0 otherwise).
            Overrides the threshold to 1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to display intermediate steps during metric computation, defaults to False.
        truths_extraction_limit (Optional[int]): Limits the number of key facts to extract from the
            retrieval context for evaluation, ordered by importance. Defaults to None.
        metric (Optional[FaithfulnessMetric]): DeepEval metric instance, constructed in ``__init__``.
    """

    # `weave.Scorer` is pydantic-based: declare fields with real defaults.
    # (The previous `field: type = Optional[None]` assigned the typing construct
    # itself as the default value, which is incorrect.)
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    truths_extraction_limit: Optional[int] = None
    metric: Optional[FaithfulnessMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Faithfulness Scorer with DeepEval's Faithfulness Metric.

        Args:
            threshold (float): The minimum score required to pass the faithfulness check, defaults to 0.5.
            model (str): The LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the assigned score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect alignment, 0 otherwise).
                Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to display intermediate steps during metric computation, defaults to False.
            truths_extraction_limit (Optional[int]): Limits the number of key facts to extract from the
                retrieval context for evaluation, ordered by importance. Defaults to None.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            truths_extraction_limit=truths_extraction_limit,
        )

        # super().__init__ already populated the declared fields; only the metric
        # instance still needs to be built. `truths_extraction_limit` is now
        # forwarded to the metric (it was previously stored but never used).
        self.metric = FaithfulnessMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the faithfulness of an LLM generated response.

        Faithfulness is calculated as:

        Faithfulness = (Number of Truthful Claims) / (Total Number of Claims).

        The Faithfulness Metric evaluates all claims in the `actual_output` and checks
        whether they are truthful based on the facts in the `retrieval_context`. Claims
        are marked truthful if they align with or do not contradict any facts in the context.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed faithfulness score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
src/rag_pipelines/evaluation/response/hallucination.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import weave
4
+ from deepeval.metrics import HallucinationMetric
5
+ from deepeval.test_case import LLMTestCase
6
+ from weave import Scorer
7
+
8
+
9
class HallucinationScorer(Scorer):
    """Evaluate the factual alignment of the generated output with the provided context.

    This scorer wraps DeepEval's `HallucinationMetric`, which determines whether the LLM
    generates factually correct information by comparing the `actual_output` to the
    provided `context`.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
        strict_mode (bool): When True, enforces a binary metric score: 1 for perfection,
            0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to False.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate the metric
            to the console, defaults to False.
        metric (Optional[HallucinationMetric]): DeepEval metric instance, constructed in ``__init__``.
    """

    # `weave.Scorer` is pydantic-based: declare fields with real defaults.
    # (The previous `field: type = Optional[None]` assigned the typing construct
    # itself as the default value, which is incorrect.)
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[HallucinationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,  # was True, contradicting the documented default (False)
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Hallucination scorer using DeepEval's Hallucination Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for
                perfection, 0 otherwise. It also overrides the current threshold and sets it to 1.
                Defaults to False.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate the metric to the
                console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        # super().__init__ already populated the declared fields; only the metric
        # instance still needs to be built.
        self.metric = HallucinationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the factual alignment of the generated output with the provided context.

        The Hallucination Score is calculated according to the following equation:

        Hallucination = Number of Contradicted Contexts / Total Number of Contexts

        The Hallucination Score uses an LLM to determine, for each context in `contexts`,
        whether there are any contradictions to the `actual_output`.

        Although extremely similar to the Faithfulness Scorer, the Hallucination Score is
        calculated differently since it uses `contexts` as the source of truth instead.
        Since `contexts` is the ideal segment of your knowledge base relevant to a specific
        input, the degree of hallucination can be measured by the degree to which the
        `contexts` is disagreed upon.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed hallucination score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
src/rag_pipelines/evaluation/response/phoenix_hallucination.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import weave
4
+ from deepeval.metrics import AnswerRelevancyMetric
5
+ from deepeval.test_case import LLMTestCase
6
+ from weave import Scorer
7
+
8
+
9
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    NOTE(review): this class lives in `phoenix_hallucination.py` but implements answer
    relevancy, not hallucination — confirm the intended file/class pairing.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and
    accuracy of LLM generated answers compared to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise).
            Overrides the threshold to 1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (Optional[AnswerRelevancyMetric]): DeepEval metric instance, constructed in ``__init__``.
    """

    # `weave.Scorer` is pydantic-based: fields must be declared at class level,
    # otherwise the `self.x = ...` assignments in __init__ fail validation.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise).
                Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        # Initialize the pydantic model first (was missing, which breaks attribute
        # assignment on a weave Scorer).
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        # DeepEval's kwarg is `verbose_mode`, not `verbose` (matches sibling scorers).
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

        Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the
        `actual_output`, before using the same LLM to classify whether each statement is
        relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
src/rag_pipelines/evaluation/response/summarization.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import weave
4
+ from deepeval.metrics import SummarizationMetric
5
+ from deepeval.test_case import LLMTestCase
6
+ from weave import Scorer
7
+
8
+
9
class SummarizationScorer(Scorer):
    """Summarization Scorer.

    This scorer wraps DeepEval's `SummarizationMetric`, which uses LLMs to determine
    whether the LLM application generates factually correct summaries while including
    the necessary details from the original text.

    Attributes:
        threshold (float): Minimum passing threshold, defaults to 0.5.
        model (str): LLM model for scoring, defaults to "gpt-4".
        include_reason (bool): Include reason for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
        async_mode (bool): Use asynchronous scoring, defaults to True.
        verbose (bool): Print intermediate steps used for scoring, defaults to False.
        assessment_questions (Optional[list[str]]): Close-ended yes/no questions an ideal
            summary should answer; generated at evaluation time when not provided.
        n (Optional[int]): Number of assessment questions to generate when none are
            provided, defaults to 5.
        truths_extraction_limit (Optional[int]): Maximum number of factual truths to
            extract from the retrieval_context. Defaults to None.
        metric (Optional[SummarizationMetric]): DeepEval metric instance, constructed in ``__init__``.
    """

    # `weave.Scorer` is pydantic-based: declare fields with real defaults.
    # (The previous `field: type = Optional[None]` assigned the typing construct
    # itself as the default value, which is incorrect.)
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    assessment_questions: Optional[list[str]] = None
    n: Optional[int] = 5
    truths_extraction_limit: Optional[int] = None
    metric: Optional[SummarizationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        assessment_questions: Optional[list[str]] = None,
        n: Optional[int] = 5,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Summarization Scorer with DeepEval's Summarization Metric.

        Args:
            threshold (float): Minimum passing threshold, defaults to 0.5.
            model (str): LLM model for scoring, defaults to "gpt-4".
            include_reason (bool): Include reason for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
            async_mode (bool): Use asynchronous scoring, defaults to True.
            verbose (bool): Print intermediate steps used for scoring, defaults to False.
            assessment_questions (Optional[list[str]]): A list of close-ended questions that can be
                answered with either a 'yes' or a 'no'. These are questions you want your summary to
                be able to ideally answer, and is especially helpful if you already know what a good
                summary for your use case looks like. If not provided, the metric will generate a set
                of `assessment_questions` at evaluation time.
            n (Optional[int]): The number of assessment questions to generate when
                `assessment_questions` is not provided. Defaults to 5.
            truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
                from the retrieval_context. Defaults to None.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            assessment_questions=assessment_questions,
            n=n,
            truths_extraction_limit=truths_extraction_limit,
        )

        # super().__init__ already populated the declared fields; only the metric
        # instance still needs to be built.
        self.metric = SummarizationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            assessment_questions=self.assessment_questions,
            n=self.n,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the quality of summarization of an LLM generated response.

        The Summarization score is calculated according to the following equation:

        Summarization = min(Alignment Score, Coverage Score)

        where,
        - Alignment Score: determines whether the summary contains hallucinated or
          contradictory information to the original text.
        - Coverage Score: determines whether the summary contains the necessary
          information from the original text.

        While the Alignment Score is similar to that of the Hallucination Score, the
        Coverage Score is first calculated by generating n closed-ended questions that
        can only be answered with either a 'yes' or a 'no', before calculating the ratio
        of which the original text and summary yields the same answer.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed summarization score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
src/rag_pipelines/evaluation/retrieval/__init__.py ADDED
File without changes
src/rag_pipelines/evaluation/retrieval/contextual_precision.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import numpy as np
4
+ import weave
5
+ from deepeval.metrics import ContextualPrecisionMetric
6
+ from deepeval.test_case import LLMTestCase
7
+ from weave import Scorer
8
+
9
+
10
class ContextualPrecisionScorer(Scorer):
    """Evaluate the contextual precision of the generated output with the provided context.

    This scorer wraps DeepEval's `ContextualPrecisionMetric`, which measures the quality
    of the pipeline's retriever by evaluating whether results in the `retrieval_context`
    that are relevant to the given input are ranked higher than irrelevant ones.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
        strict_mode (bool): When True, enforces a binary metric score: 1 for perfection,
            0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to False.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate the metric
            to the console, defaults to False.
        metric (Optional[ContextualPrecisionMetric]): DeepEval metric instance, constructed in ``__init__``.
    """

    # `weave.Scorer` is pydantic-based: declare fields with real defaults.
    # (The previous `field: type = Optional[None]` assigned the typing construct
    # itself as the default value, which is incorrect.)
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualPrecisionMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,  # was True, contradicting the documented default (False)
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Precision Scorer using DeepEval's Contextual Precision Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for
                perfection, 0 otherwise. It also overrides the current threshold and sets it to 1.
                Defaults to False.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate the metric to the
                console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        # super().__init__ already populated the declared fields; only the metric
        # instance still needs to be built.
        self.metric = ContextualPrecisionMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual precision of the generated output with the provided context.

        The Contextual Precision Score is calculated according to the following equation:

        Contextual Precision = (1 / Number of Relevant Results) * Sum((Number of Relevant
        Results up to position k / k) * Binary Relevance of k'th result)

        where,
        - k: The position of the result in the list of all results.

        The Contextual Precision Scorer first uses an LLM to determine for each result in the
        `retrieval_context` whether it is relevant to the input based on information in the
        `expected_output`, before calculating the weighted cumulative precision as the
        contextual precision score.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): Pipeline result holding the "output" (generated response)
                and "retrieval_context" (retrieved passages) keys. Defaults to None.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual precision score.
        """
        # Guard against output=None (the declared default) before calling .get().
        output = output or {}

        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the Contextual Precision Scorer.

        Args:
            score_rows (list): Dictionaries produced by ``score``, each containing:
                - "score" (float): The computed contextual precision score.

        Returns:
            dict: A dictionary with a single "contextual_precision_score" key mapping to:
                - "score" (float): The mean contextual precision score.
                - "variance" (float): The variance of the scores.
                - "std" (float): The standard deviation of the scores.
                - "count" (int): The number of scores aggregated.
                All statistics are 0.0 when ``score_rows`` is empty.
        """
        # Missing "score" entries default to 0.0 so one malformed row cannot abort the summary.
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # Guard the empty case: np.mean([]) emits a RuntimeWarning and yields NaN.
        if not scores:
            return {"contextual_precision_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        return {
            "contextual_precision_score": {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        }
src/rag_pipelines/evaluation/retrieval/contextual_recall.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import weave
4
+ from deepeval.metrics import ContextualRecallMetric
5
+ from deepeval.test_case import LLMTestCase
6
+ from weave import Scorer
7
+
8
+
9
class ContextualRecallScorer(Scorer):
    """Evaluate the contextual recall of the generated output with the provided context.

    This scorer uses DeepEval's `ContextualRecall` Metric to assess how well the generated output
    aligns with the reference context.

    The contextual recall metric measures the quality of the pipeline's retriever by evaluating
    the extent to which the `retrieval_context` aligns with the `expected_output`.

    Attributes:
        threshold (Optional[float]): A float representing the minimum passing threshold, defaults to 0.5.
        model (Optional[str]): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (Optional[bool]): Whether to include a reason for the evaluation score, defaults to True.
        strict_mode (Optional[bool]): A boolean which when set to True, enforces a binary metric score: 1 for
            perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to True.
        async_mode (Optional[bool]): Whether to use asynchronous scoring, defaults to True.
        verbose (Optional[bool]): Whether to print the intermediate steps used to calculate said metric to the
            console, defaults to False.
        metric (Optional[ContextualRecallMetric]): The DeepEval ContextualRecallMetric.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). The previous
    # code assigned `Optional[None]` — the typing special form, which evaluates to
    # `NoneType` — as the *default value*; the intent was optional fields that
    # default to None until __init__ populates them.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[ContextualRecallMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Recall Scorer using DeepEval's Contextual Recall Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for
                perfection, 0 otherwise. It also overrides the current threshold and sets it to 1.
                Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said metric to the
                console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # DeepEval names the console-logging flag `verbose_mode`.
        self.metric = ContextualRecallMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual recall of the generated output with the provided context.

        The Contextual Recall Score is calculated according to the following equation:

        Contextual Recall = Number of Attributable Results / Total Number of Results

        The Contextual Recall Scorer first uses an LLM to extract all statements made in the
        `expected_output`, before using the same LLM to classify whether each statement can be
        attributed to results in the `retrieval_context`.

        A higher contextual recall score represents a greater ability of the retrieval system to
        capture all relevant information from the total available relevant set within your
        knowledge base.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual recall score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # `measure` populates `score` (and `reason` when include_reason is True).
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
src/rag_pipelines/evaluation/retrieval/contextual_relevancy.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import weave
4
+ from deepeval.metrics import ContextualRelevancyMetric
5
+ from deepeval.test_case import LLMTestCase
6
+ from weave import Scorer
7
+
8
+
9
class ContextualRelevancyScorer(Scorer):
    """Evaluate the contextual relevancy of the generated output with the provided context.

    This scorer uses DeepEval's `ContextualRelevancy` Metric to assess how well the generated output
    aligns with the reference context.

    The contextual relevancy metric measures the quality of the RAG pipeline's retriever by
    evaluating the overall relevance of the information presented in the `retrieval_context`
    for a given input.

    Attributes:
        threshold (Optional[float]): A float representing the minimum passing threshold, defaults to 0.5.
        model (Optional[str]): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (Optional[bool]): Whether to include a reason for the evaluation score, defaults to True.
        strict_mode (Optional[bool]): A boolean which when set to True, enforces a binary metric score: 1 for
            perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to True.
        async_mode (Optional[bool]): Whether to use asynchronous scoring, defaults to True.
        verbose (Optional[bool]): Whether to print the intermediate steps used to calculate said metric to the
            console, defaults to False.
        metric (Optional[ContextualRelevancyMetric]): The DeepEval ContextualRelevancyMetric.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). The previous
    # code assigned `Optional[None]` — the typing special form, which evaluates to
    # `NoneType` — as the *default value*; the intent was optional fields that
    # default to None until __init__ populates them.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[ContextualRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Relevancy Scorer using DeepEval's Contextual Relevancy Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for
                perfection, 0 otherwise. It also overrides the current threshold and sets it to 1.
                Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said metric to the
                console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # DeepEval names the console-logging flag `verbose_mode`.
        self.metric = ContextualRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual relevancy of the generated output with the provided context.

        The Contextual Relevancy Score is calculated according to the following equation:

        Contextual Relevancy = Number of Relevant Results / Total Number of Results

        Although similar to how the Answer Relevancy Score is calculated, the Contextual Relevancy
        Metric first uses an LLM to extract all statements made in the `retrieval_context` instead,
        before using the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # `measure` populates `score` (and `reason` when include_reason is True).
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
src/rag_pipelines/llms/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from rag_pipelines.llms.groq import ChatGroqGenerator
2
+
3
+ __all__ = ["ChatGroqGenerator"]
src/rag_pipelines/llms/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (256 Bytes). View file
 
src/rag_pipelines/llms/__pycache__/groq.cpython-310.pyc ADDED
Binary file (3.63 kB). View file
 
src/rag_pipelines/llms/groq.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Optional
3
+
4
+ import weave
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_groq import ChatGroq
7
+ from pydantic import BaseModel
8
+
9
+ from rag_pipelines.prompts import STRUCTURED_RAG_PROMPT, RAGResponseModel
10
+
11
+
12
class ChatGroqGenerator:
    """Generate answers with a ChatGroq model from a user question plus retrieved documents.

    Wraps prompt construction, structured-output binding, and model invocation
    behind a single callable interface.
    """

    model: str
    api_key: str
    llm_params: dict[str, Any]
    llm: Optional[ChatGroq] = None
    structured_output_model: BaseModel
    system_prompt: str

    def __init__(
        self,
        model: str,
        api_key: Optional[str] = None,
        llm_params: Optional[dict[str, Any]] = None,
        structured_output_model: BaseModel = RAGResponseModel,
        system_prompt: str = STRUCTURED_RAG_PROMPT,
    ):
        """Set up the generator and instantiate the underlying ChatGroq client.

        Args:
            model (str): The name of the ChatGroq model to use.
            api_key (Optional[str]): API key for the ChatGroq service; falls back to the
                "GROQ_API_KEY" environment variable when omitted.
            llm_params (Optional[dict]): Additional parameters forwarded to the ChatGroq constructor.
            structured_output_model (BaseModel): The output model for structured responses.
            system_prompt (str): The system prompt for the ChatGroq model.

        Raises:
            ValueError: If no API key is given and the "GROQ_API_KEY" environment variable is unset.
        """
        resolved_params = {} if llm_params is None else llm_params

        # Fall back to the environment when no key (or an empty one) is supplied.
        resolved_key = api_key or os.environ.get("GROQ_API_KEY")
        if resolved_key is None:
            msg = "GROQ_API_KEY is not set. Please provide an API key or set it as an environment variable."
            raise ValueError(msg)

        self.model = model
        self.api_key = resolved_key
        self.llm_params = resolved_params
        self.structured_output_model = structured_output_model
        self.system_prompt = system_prompt

        self.llm = ChatGroq(model=self.model, api_key=self.api_key, **resolved_params)

    @weave.op()
    def __call__(self, state: dict[str, Any]) -> dict[str, Any]:
        """Produce an answer for the question in *state* using its context documents.

        Args:
            state (dict[str, Any]): The current state, containing:
                - 'question': The user question.
                - 'context': A list of filtered document texts.
                - 'documents': A list of retrieved documents.

        Returns:
            dict[str, Any]: The incoming 'question', 'context', and 'documents'
            keys, plus 'answer' with the generated response.
        """
        query = state["question"]
        passages = state["context"]
        retrieved_docs = state["documents"]

        # Build a system-only chat template and bind the structured output schema.
        chat_template = ChatPromptTemplate.from_messages([("system", self.system_prompt)])
        structured_llm = self.llm.with_structured_output(self.structured_output_model)
        response_chain = chat_template | structured_llm

        structured_response = response_chain.invoke(
            {"question": query, "context": "\n".join(passages)}
        )

        return {
            "question": query,
            "context": passages,
            "documents": retrieved_docs,
            "answer": structured_response.final_answer,
        }
src/rag_pipelines/pipelines/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from rag_pipelines.pipelines.self_rag import SelfRAGPipeline
2
+
3
+ __all__ = ["SelfRAGPipeline"]
src/rag_pipelines/pipelines/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (268 Bytes). View file
 
src/rag_pipelines/pipelines/__pycache__/self_rag.cpython-310.pyc ADDED
Binary file (5.1 kB). View file
 
src/rag_pipelines/pipelines/__pycache__/self_rag_graph_state.cpython-310.pyc ADDED
Binary file (1.09 kB). View file
 
src/rag_pipelines/pipelines/adaptive_rag.py ADDED
File without changes
src/rag_pipelines/pipelines/adaptive_rag_graph_state.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from typing_extensions import TypedDict
3
+
4
+
5
class AdaptiveRAGGraphState(TypedDict):
    """Represents the state of the graph for the Adaptive Retrieval-Augmentation-Generation (Adaptive-RAG) pipeline.

    Attributes:
        question (str): The input question for the pipeline.
        answer (str): The generated response from the LLM.
        documents (list[Document]): A list of LangChain documents that are retrieved and processed through the pipeline.
        context (list[str]): The final list of context documents passed to the LLM for generating the answer.
    """

    # Input question driving the pipeline run.
    question: str
    # Final LLM-generated response.
    answer: str
    # Retrieved LangChain Document objects carried between graph nodes.
    documents: list[Document]
    # Plain-text context strings handed to the LLM.
    context: list[str]
src/rag_pipelines/pipelines/crag.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Optional
3
+
4
+ import weave
5
+ from langchain_community.retrievers import PineconeHybridSearchRetriever
6
+ from langchain_core.prompts.chat import ChatPromptTemplate
7
+ from langgraph.graph import END, START, StateGraph
8
+ from langgraph.graph.state import CompiledStateGraph
9
+ from weave.integrations.langchain import WeaveTracer
10
+
11
+ from rag_pipelines.llms.groq import ChatGroqGenerator
12
+ from rag_pipelines.pipelines.crag_graph_state import CRAGGraphState
13
+ from rag_pipelines.query_transformer import QueryTransformer
14
+ from rag_pipelines.retrieval_evaluator import DocumentGrader, QueryDecisionMaker
15
+ from rag_pipelines.websearch import WebSearch
16
+
17
+ # Disable global tracing explicitly
18
+ os.environ["WEAVE_TRACE_LANGCHAIN"] = "false"
19
+
20
+
21
class CorrectiveRAGPipeline(weave.Model):
    """A corrective retrieval-augmented generation (RAG) pipeline using Weave for tracing and LangChain components.

    This pipeline integrates document retrieval, relevance evaluation, grading, query transformation, web search,
    and LLM-based response generation to implement a corrective RAG system. It utilizes Weave for tracing execution
    details and LangChain components for processing.

    Attributes:
        retriever (Optional[PineconeHybridSearchRetriever]): The retrieval model used to fetch relevant documents based on a query.
        prompt (Optional[ChatPromptTemplate]): The prompt template to generate questions for the LLM.
        generator (Optional[ChatGroqGenerator]): The language model used to generate responses.
        grader (Optional[DocumentGrader]): Grades documents based on evaluation results.
        query_transformer (Optional[QueryTransformer]): Transforms user queries to optimize retrieval.
        web_search (Optional[WebSearch]): Performs web search for additional context.
        tracing_project_name (str): The name of the Weave project for tracing.
        weave_params (dict[str, Any]): Parameters for initializing Weave.
        tracer (Optional[WeaveTracer]): The tracer used to record execution details with Weave.
    """

    retriever: Optional[PineconeHybridSearchRetriever] = None
    prompt: Optional[ChatPromptTemplate] = None
    generator: Optional[ChatGroqGenerator] = None
    grader: Optional[DocumentGrader] = None
    query_transformer: Optional[QueryTransformer] = None
    web_search: Optional[WebSearch] = None
    tracing_project_name: str
    weave_params: dict[str, Any]
    tracer: Optional[WeaveTracer] = None

    def __init__(
        self,
        retriever: PineconeHybridSearchRetriever,
        prompt: ChatPromptTemplate,
        generator: ChatGroqGenerator,
        grader: DocumentGrader,
        query_transformer: QueryTransformer,
        web_search: WebSearch,
        tracing_project_name: str = "corrective_rag",
        weave_params: Optional[dict[str, Any]] = None,
    ):
        """Initialize the CorrectiveRAGPipeline.

        Args:
            retriever (PineconeHybridSearchRetriever): The retrieval model used to fetch documents for the RAG pipeline.
            prompt (ChatPromptTemplate): The prompt template used to create questions for the LLM.
            generator (ChatGroqGenerator): The language model used for response generation.
            grader (DocumentGrader): Component to grade the relevance of evaluated documents.
            query_transformer (QueryTransformer): Component to transform the user query.
            web_search (WebSearch): Component to perform web search for additional context.
            tracing_project_name (str): The name of the Weave project for tracing. Defaults to "corrective_rag".
            weave_params (Optional[dict[str, Any]]): Additional parameters for initializing Weave.
        """
        if weave_params is None:
            weave_params = {}

        super().__init__(
            retriever=retriever,
            prompt=prompt,
            generator=generator,
            grader=grader,
            query_transformer=query_transformer,
            web_search=web_search,
            tracing_project_name=tracing_project_name,
            weave_params=weave_params,
        )

        self.retriever = retriever
        self.prompt = prompt
        self.generator = generator
        self.grader = grader
        self.query_transformer = query_transformer
        self.web_search = web_search
        self.tracing_project_name = tracing_project_name
        self.weave_params = weave_params

        self._initialize_weave(**weave_params)

    def _initialize_weave(self, **weave_params) -> None:
        """Initialize Weave with the specified tracing project name.

        Sets up the Weave environment and creates a tracer for monitoring pipeline execution.

        Args:
            weave_params (dict[str, Any]): Additional parameters for configuring Weave.
        """
        weave.init(self.tracing_project_name, **weave_params)
        self.tracer = WeaveTracer()

    def _build_crag_graph(self) -> CompiledStateGraph:
        """Build and compile the corrective RAG workflow graph.

        The graph defines the flow between components like retrieval, grading, query transformation,
        web search, and generation.

        Returns:
            CompiledStateGraph: The compiled state graph representing the corrective RAG pipeline workflow.
        """
        crag_workflow = StateGraph(CRAGGraphState)

        # Define the nodes
        crag_workflow.add_node("retrieve", self.retriever)
        crag_workflow.add_node("grade_documents", self.grader)
        crag_workflow.add_node("generate", self.generator)
        crag_workflow.add_node("transform_query", self.query_transformer)
        crag_workflow.add_node("web_search_node", self.web_search)

        # Define edges between nodes
        crag_workflow.add_edge(START, "retrieve")
        crag_workflow.add_edge("retrieve", "grade_documents")
        # After grading, either answer directly or rewrite the query and search the web.
        crag_workflow.add_conditional_edges(
            "grade_documents",
            QueryDecisionMaker(),
            {
                "transform_query": "transform_query",
                "generate": "generate",
            },
        )
        crag_workflow.add_edge("transform_query", "web_search_node")
        crag_workflow.add_edge("web_search_node", "generate")
        crag_workflow.add_edge("generate", END)

        # Compile the graph
        crag_pipeline = crag_workflow.compile()

        return crag_pipeline

    @weave.op()
    def predict(self, question: str) -> dict[str, Any]:
        """Execute the corrective RAG pipeline with the given question.

        The pipeline retrieves documents, evaluates and grades their relevance, and generates a final response
        using the LLM.

        Args:
            question (str): The input question to be answered.

        Returns:
            dict[str, Any]: The final graph state (see CRAGGraphState), including
            the generated response.

        Example:
            ```python
            pipeline = CorrectiveRAGPipeline(...)
            result = pipeline.predict("What are the latest AI trends?")
            print(result["generation"])
            ```
        """
        config = {"callbacks": [self.tracer]}

        crag_graph = self._build_crag_graph()
        # The compiled LangGraph expects the *initial state mapping* (CRAGGraphState),
        # not a bare string; previously the raw question was passed directly.
        response = crag_graph.invoke({"question": question}, config=config)

        return response
src/rag_pipelines/pipelines/crag_graph_state.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing_extensions import TypedDict
2
+
3
+
4
class CRAGGraphState(TypedDict):
    """Represents the state of the graph for the Corrective Retrieval-Augmentation-Generation (CRAG) pipeline.

    Attributes:
        question (str): The input question for the pipeline.
        generation (str): The generated response from the LLM.
        web_search (str): Indicates whether a web search is required (e.g., "yes" or "no").
        documents (List[str]): A list of relevant documents retrieved or processed.
    """

    # Input question driving the pipeline run.
    question: str
    # Final LLM-generated response.
    generation: str
    # Flag set by the grading step to request a web-search fallback.
    web_search: str
    # Relevant documents retrieved or produced along the way.
    documents: list[str]
src/rag_pipelines/pipelines/dspy/dspy_baseline_rag.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import dspy
4
+ from datasets import load_dataset
5
+ from dspy_modules.evaluator import DSPyEvaluator
6
+ from dspy_modules.rag import DSPyRAG
7
+ from dspy_modules.weaviate_db import WeaviateVectorDB
8
+
9
+
10
def main(cluster_url, api_key, index_name, model_name, llm_model, llm_api_key):
    """Run the baseline (unoptimized) DSPy RAG pipeline and evaluate it on a dev split.

    Args:
        cluster_url (str): Weaviate cluster URL.
        api_key (str): Weaviate API key.
        index_name (str): Weaviate index name.
        model_name (str): Embedding model name.
        llm_model (str): LLM model name.
        llm_api_key (str): LLM API key.
    """
    # Load dataset
    earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
    questions = earnings_calls_data["question"]

    # Only the dev slice is used here; the previous version also built train/test
    # example lists and immediately discarded them (dead comprehensions removed).
    devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]

    # Initialize Weaviate VectorDB
    weaviate_db = WeaviateVectorDB(cluster_url, api_key, index_name, model_name)

    # Initialize LLM
    llm = dspy.LM(llm_model, api_key=llm_api_key, num_retries=120)
    dspy.configure(lm=llm)

    # Initialize RAG
    rag = DSPyRAG(weaviate_db)

    # Evaluate before compilation
    evaluator = DSPyEvaluator()
    evaluate = dspy.Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(rag, metric=evaluator.llm_metric)
34
+
35
+
36
+ if __name__ == "__main__":
37
+ parser = argparse.ArgumentParser(description="Run DSPy-based RAG pipeline")
38
+ parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL")
39
+ parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key")
40
+ parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name")
41
+ parser.add_argument("--model_name", type=str, required=True, help="Embedding model name")
42
+ parser.add_argument("--llm_model", type=str, required=True, help="LLM model name")
43
+ parser.add_argument("--llm_api_key", type=str, required=True, help="LLM API key")
44
+
45
+ args = parser.parse_args()
46
+ main(args.cluster_url, args.api_key, args.index_name, args.model_name, args.llm_model, args.llm_api_key)
src/rag_pipelines/pipelines/dspy/dspy_bayesian_signature_optimization_rag.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os

import dspy
import weaviate
from datasets import load_dataset
from dspy.evaluate.evaluate import Evaluate
from dspy.primitives.prediction import Prediction
from dspy.teleprompt import BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_weaviate.vectorstores import WeaviateVectorStore
from weaviate.classes.init import Auth
13
# Argument Parser
parser = argparse.ArgumentParser(description="RAG Optimization with DSPy")
parser.add_argument(
    "--optimizer",
    type=str,
    choices=["bootstrap", "bayesian"],
    default="bootstrap",
    help="Choose the optimization method",
)
args = parser.parse_args()

# Load dataset
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
questions = earnings_calls_data["question"]

# Create DSPy datasets
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
testset = [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]

# Embeddings and Weaviate client
embeddings = HuggingFaceEmbeddings(
    model_name="jinaai/jina-embeddings-v3",
    model_kwargs={"device": "cpu", "trust_remote_code": True},
    encode_kwargs={"task": "retrieval.query", "prompt_name": "retrieval.query"},
)

# SECURITY: the Weaviate and Groq API keys were previously hard-coded in this
# file. Credentials committed to version control must be considered compromised
# and rotated; they are now read from the environment instead.
weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WEAVIATE_CLUSTER_URL"],
    auth_credentials=Auth.api_key(os.environ["WEAVIATE_API_KEY"]),
)
weaviate_db = WeaviateVectorStore(
    # Index name is configurable; defaults to the previously hard-coded value.
    index_name=os.environ.get("WEAVIATE_INDEX_NAME", "LangChain_d73ad6159d514fec887456fa6db11e61"),
    embedding=embeddings,
    client=weaviate_client,
    text_key="text",
)

# Configure LLM
llm = dspy.LM(
    "groq/llama-3.3-70b-versatile",
    api_key=os.environ["GROQ_API_KEY"],
    num_retries=120,
)
dspy.configure(lm=llm)
59
+
60
# Define DSPy Module
class GenerateAnswer(dspy.Signature):
    # NOTE: intentionally no class docstring — dspy uses a Signature's docstring
    # as the task instruction sent to the LLM, so adding one would change behavior.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="short and precise answer")
65
+
66
+
67
class RAG(dspy.Module):
    """Minimal retrieve-then-generate DSPy module backed by the module-level Weaviate store."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought generation over the GenerateAnswer signature.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def retrieve(self, question):
        """Fetch passages relevant to *question* from the global `weaviate_db`."""
        hits = weaviate_db.similarity_search(query=question)
        return Prediction(passages=[hit.page_content for hit in hits])

    def forward(self, question):
        """Retrieve context for *question* and generate an answer grounded in it."""
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)
81
+
82
+
83
# Define LLM Metric
def llm_metric(gold, pred, trace=None):
    """Score a prediction via LLM self-assessment; returns a normalized float.

    NOTE(review): the three ChainOfThought calls pass `assessed_question` /
    `assessed_answer`, but `GenerateAnswer` only declares `context`, `question`
    and `answer` fields — this looks like it was meant to use a dedicated
    "Assess" signature; confirm against the original DSPy assessment recipe.
    """
    predicted_answer = pred.answer
    context = pred.context
    # Detail check deliberately ignores the retrieved context ("N/A").
    detail = dspy.ChainOfThought(GenerateAnswer)(
        context="N/A", assessed_question="Is the answer detailed?", assessed_answer=predicted_answer
    )
    faithful = dspy.ChainOfThought(GenerateAnswer)(
        context=context, assessed_question="Is it grounded in context?", assessed_answer=predicted_answer
    )
    overall = dspy.ChainOfThought(GenerateAnswer)(
        context=context, assessed_question=f"Rate the answer: {predicted_answer}", assessed_answer=predicted_answer
    )
    # float(...) assumes each LLM reply is a bare number — TODO confirm;
    # non-numeric replies raise ValueError here.
    # Weighted sum: detail (x1) + faithfulness (x2) + overall (x1), scaled by 1/5.
    total = float(detail.answer) + float(faithful.answer) * 2 + float(overall.answer)
    return total / 5.0
98
+
99
+
100
# Evaluate before optimization (baseline numbers on the dev split)
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
evaluate(RAG(), metric=llm_metric)

# Select Optimizer (chosen via the --optimizer CLI flag)
if args.optimizer == "bootstrap":
    optimizer = BootstrapFewShotWithRandomSearch(
        metric=llm_metric,
        max_bootstrapped_demos=4,
        max_labeled_demos=4,
        max_rounds=1,
        num_candidate_programs=2,
        num_threads=2,
    )
else:
    optimizer = BayesianSignatureOptimizer(
        task_model=dspy.settings.lm, metric=llm_metric, prompt_model=dspy.settings.lm, n=5, verbose=False
    )

# Compile optimized RAG against the training split
optimized_compiled_rag = optimizer.compile(RAG(), trainset=trainset)

# Evaluate optimized RAG on the same dev split for a before/after comparison
evaluate = Evaluate(metric=llm_metric, devset=devset, num_threads=1, display_progress=True, display_table=5)
evaluate(optimized_compiled_rag)
src/rag_pipelines/pipelines/dspy/dspy_bootstrap_few_shot_optimization_rag.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import dspy
4
+ from datasets import load_dataset
5
+ from dspy.evaluate.evaluate import Evaluate
6
+ from dspy.teleprompt import BootstrapFewShot
7
+ from dspy_modules.evaluator import llm_metric
8
+ from dspy_modules.rag import RAG
9
+ from dspy_modules.weaviate_db import WeaviateVectorDB
10
+
11
+
12
+ def main(args):
13
+ # Load dataset
14
+ earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
15
+ questions = earnings_calls_data["question"]
16
+
17
+ # Split dataset
18
+ trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
19
+ devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
20
+ [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
21
+
22
+ # Initialize Weaviate VectorDB
23
+ weaviate_db = WeaviateVectorDB(
24
+ cluster_url=args.cluster_url, api_key=args.api_key, index_name=args.index_name, model_name=args.embedding_model
25
+ )
26
+
27
+ # Initialize LLM
28
+ llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
29
+ dspy.configure(lm=llm)
30
+
31
+ # Initialize and evaluate unoptimized RAG
32
+ uncompiled_rag = RAG(weaviate_db)
33
+ evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
34
+ evaluate(uncompiled_rag, metric=llm_metric)
35
+
36
+ # Optimize RAG using BootstrapFewShot
37
+ optimizer = BootstrapFewShot(metric=llm_metric)
38
+ optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
39
+
40
+ # Evaluate optimized RAG
41
+ evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
42
+ evaluate(optimized_compiled_rag)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
47
+
48
+ # Weaviate parameters
49
+ parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL")
50
+ parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key")
51
+ parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name")
52
+ parser.add_argument("--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model name")
53
+
54
+ # LLM parameters
55
+ parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name")
56
+ parser.add_argument("--llm_api_key", type=str, required=True, help="LLM API key")
57
+ parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM calls")
58
+
59
+ args = parser.parse_args()
60
+ main(args)
src/rag_pipelines/pipelines/dspy/dspy_copro_instruction_signature_optimization_rag.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/weaviate/recipes/blob/main/integrations/llm-frameworks/dspy/1.Getting-Started-with-RAG-in-DSPy.ipynb
2
+
3
+ import dspy
4
+ import weaviate
5
+ from datasets import load_dataset
6
+ from dspy.evaluate.evaluate import Evaluate
7
+ from dspy.primitives.prediction import Prediction
8
+ from dspy.teleprompt import COPRO
9
+ from langchain_huggingface import HuggingFaceEmbeddings
10
+ from langchain_weaviate.vectorstores import WeaviateVectorStore
11
+ from weaviate.classes.init import Auth
12
+
13
+ earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
14
+ questions = earnings_calls_data["question"]
15
+
16
+ # Create the dspy datasets
17
+ trainset = questions[:20] # 20 examples for training
18
+ devset = questions[20:30] # 10 examples for development
19
+ testset = questions[30:] # 20 examples for testing
20
+
21
+ trainset = [dspy.Example(question=question).with_inputs("question") for question in trainset]
22
+ devset = [dspy.Example(question=question).with_inputs("question") for question in devset]
23
+ testset = [dspy.Example(question=question).with_inputs("question") for question in testset]
24
+
25
+
26
+ model_name = "jinaai/jina-embeddings-v3"
27
+ task = "retrieval.query"
28
+ model_kwargs = {"device": "cpu", "trust_remote_code": True}
29
+ encode_kwargs = {"task": task, "prompt_name": task}
30
+ embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
31
+
32
+
33
# NOTE(security): live Weaviate and Groq API keys were previously hard-coded
# here and committed to version control. They must be rotated; credentials are
# now read from the environment instead.
import os

weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WEAVIATE_CLUSTER_URL"],
    auth_credentials=Auth.api_key(os.environ["WEAVIATE_API_KEY"]),
)
weaviate_db = WeaviateVectorStore(
    index_name="LangChain_d73ad6159d514fec887456fa6db11e61",
    embedding=embeddings,
    client=weaviate_client,
    text_key="text",
)


llm = dspy.LM(
    "groq/llama-3.3-70b-versatile",
    api_key=os.environ["GROQ_API_KEY"],
    num_retries=120,
)
dspy.configure(lm=llm)
51
+
52
+
53
+ class GenerateAnswer(dspy.Signature):
54
+ """Answer questions with short factoid answers."""
55
+
56
+ context = dspy.InputField(desc="may contain relevant facts")
57
+ question = dspy.InputField()
58
+ answer = dspy.OutputField(desc="short and precise answer")
59
+
60
+
61
+ class RAG(dspy.Module):
62
+ def __init__(self):
63
+ super().__init__()
64
+ self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
65
+
66
+ # This makes it possible to use the Langchain VectorDB integration and custom embeddings with SentenceTransformers
67
+ def retrieve(self, question):
68
+ results = weaviate_db.similarity_search(query=question)
69
+ passages = [res.page_content for res in results]
70
+ return Prediction(passages=passages)
71
+
72
+ def forward(self, question):
73
+ context = self.retrieve(question).passages
74
+ prediction = self.generate_answer(context=context, question=question)
75
+ return dspy.Prediction(context=context, answer=prediction.answer)
76
+
77
+
78
+ # Create an LLM as a Judge Evaluation Metric for evaluation of the RAG Pipelines
79
+ # (Taken from weaviate recipe)
80
+
81
+
82
+ class Assess(dspy.Signature):
83
+ """Assess the quality of an answer to a question."""
84
+
85
+ context = dspy.InputField(desc="The context for answering the question.")
86
+ assessed_question = dspy.InputField(desc="The evaluation criterion.")
87
+ assessed_answer = dspy.InputField(desc="The answer to the question.")
88
+ assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")
89
+
90
+
91
+ def llm_metric(gold, pred, trace=None):
92
+ predicted_answer = pred.answer
93
+ context = pred.context
94
+ question = gold.question
95
+
96
+ print(f"Test Question: {question}")
97
+ print(f"Predicted Answer: {predicted_answer}")
98
+
99
+ detail = "Is the assessed answer detailed?"
100
+ faithful = (
101
+ "Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."
102
+ )
103
+ overall = f"Please rate how well this answer answers the question, `{question}` based on the context.\n `{predicted_answer}`"
104
+
105
+ detail = dspy.ChainOfThought(Assess)(context="N/A", assessed_question=detail, assessed_answer=predicted_answer)
106
+ faithful = dspy.ChainOfThought(Assess)(
107
+ context=context, assessed_question=faithful, assessed_answer=predicted_answer
108
+ )
109
+ overall = dspy.ChainOfThought(Assess)(context=context, assessed_question=overall, assessed_answer=predicted_answer)
110
+
111
+ print(f"Faithful: {faithful.assessment_answer}")
112
+ print(f"Detail: {detail.assessment_answer}")
113
+ print(f"Overall: {overall.assessment_answer}")
114
+
115
+ total = float(detail.assessment_answer) + float(faithful.assessment_answer) * 2 + float(overall.assessment_answer)
116
+
117
+ return total / 5.0
118
+
119
+
120
+ # Evaluate our RAG Program before it is compiled
121
+ evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
122
+ evaluate(RAG(), metric=llm_metric)
123
+
124
+
125
+ # Optimize the RAG Program
126
+ optimizer = COPRO(
127
+ prompt_model=dspy.settings.lm,
128
+ metric=llm_metric,
129
+ breadth=3,
130
+ depth=2,
131
+ init_temperature=0.25,
132
+ verbose=False,
133
+ )
134
+
135
+
136
+ optimized_compiled_rag = optimizer.compile(
137
+ RAG(),
138
+ trainset=trainset,
139
+ eval_kwargs={"num_threads": 1, "display_progress": True, "display_table": 0},
140
+ )
141
+
142
+ # Evaluate the optimized RAG Program
143
+ evaluate = Evaluate(
144
+ metric=llm_metric,
145
+ devset=devset,
146
+ num_threads=1,
147
+ display_progress=True,
148
+ display_table=5,
149
+ )
150
+ evaluate(optimized_compiled_rag)
src/rag_pipelines/pipelines/dspy_baseline_rag.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import dspy
4
+ from datasets import load_dataset
5
+
6
+ from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
7
+ from rag_pipelines.dspy.dspy_rag import DSPyRAG
8
+ from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
9
+
10
+
11
+ def main(cluster_url, api_key, index_name, model_name, llm_model, llm_api_key):
12
+ """Executes the DSPy-based Retrieval-Augmented Generation (RAG) pipeline.
13
+
14
+ This function:
15
+ 1. Loads a dataset of earnings call Q&A pairs.
16
+ 2. Prepares development (dev) and test datasets for evaluation.
17
+ 3. Initializes a Weaviate vector database for storing and retrieving embeddings.
18
+ 4. Configures a Large Language Model (LLM) with DSPy.
19
+ 5. Instantiates and evaluates the RAG pipeline before optimization.
20
+
21
+ Args:
22
+ cluster_url (str): URL of the Weaviate vector database cluster.
23
+ api_key (str): API key for authenticating access to Weaviate.
24
+ index_name (str): Name of the Weaviate index for storing vectors.
25
+ model_name (str): Embedding model name for vectorization.
26
+ llm_model (str): Name of the LLM used for inference.
27
+ llm_api_key (str): API key for accessing the LLM.
28
+ """
29
+ # Load the earnings calls Q&A dataset (first 50 samples)
30
+ earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
31
+ questions = earnings_calls_data["question"]
32
+
33
+ # Prepare dataset splits:
34
+ # - The first 20 questions are used for training (not explicitly utilized here).
35
+ # - The next 10 questions are used as the development set (devset) for evaluation.
36
+ # - The remaining questions are part of the test set (not used in this script).
37
+ devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
38
+
39
+ # Initialize Weaviate VectorDB for embedding storage and retrieval
40
+ weaviate_db = WeaviateVectorDB(
41
+ cluster_url=cluster_url, # Weaviate cluster URL
42
+ api_key=api_key, # API key for authentication
43
+ index_name=index_name, # Name of the index for vector storage
44
+ model_name=model_name, # Embedding model used for vectorization
45
+ )
46
+
47
+ # Initialize the LLM with DSPy
48
+ llm = dspy.LM(llm_model, api_key=llm_api_key, num_retries=120)
49
+ dspy.configure(lm=llm) # Set DSPy’s global LLM configuration
50
+
51
+ # Instantiate the RAG pipeline
52
+ rag = DSPyRAG(weaviate_db)
53
+
54
+ # Initialize evaluator for measuring LLM-based retrieval performance
55
+ evaluator = DSPyEvaluator()
56
+
57
+ # Evaluate the RAG pipeline before optimization
58
+ evaluate = dspy.Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
59
+ evaluate(rag, metric=evaluator.llm_metric)
60
+
61
+
62
+ if __name__ == "__main__":
63
+ """
64
+ Parses command-line arguments and runs the DSPy-based RAG pipeline.
65
+ """
66
+
67
+ parser = argparse.ArgumentParser(description="Run DSPy-based RAG pipeline")
68
+
69
+ # Weaviate configuration parameters
70
+ parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
71
+ parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
72
+ parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
73
+ parser.add_argument("--model_name", type=str, required=True, help="Embedding model name for vectorization.")
74
+
75
+ # LLM configuration parameters
76
+ parser.add_argument("--llm_model", type=str, required=True, help="LLM model name.")
77
+ parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM access.")
78
+
79
+ # Parse command-line arguments and execute the pipeline
80
+ args = parser.parse_args()
81
+ main(args.cluster_url, args.api_key, args.index_name, args.model_name, args.llm_model, args.llm_api_key)
src/rag_pipelines/pipelines/dspy_bayesian_signature_optimization_rag.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import dspy
4
+ import weaviate
5
+ from datasets import load_dataset
6
+ from dspy.evaluate.evaluate import Evaluate
7
+ from dspy.teleprompt import BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from weaviate.classes.init import Auth
10
+
11
+ from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
12
+ from rag_pipelines.dspy.dspy_rag import DSPyRAG
13
+ from rag_pipelines.vectordb.weaviate import WeaviateVectorStore
14
+
15
+
16
+ def parse_args():
17
+ """Parse command-line arguments."""
18
+ parser = argparse.ArgumentParser(description="Optimize and evaluate RAG pipeline with DSPy.")
19
+
20
+ # Dataset Arguments
21
+ parser.add_argument(
22
+ "--dataset_name", type=str, default="lamini/earnings-calls-qa", help="Name of the dataset to use."
23
+ )
24
+ parser.add_argument("--dataset_size", type=int, default=50, help="Number of examples to load from the dataset.")
25
+
26
+ # Weaviate Configuration
27
+ parser.add_argument("--weaviate_url", type=str, required=True, help="Weaviate cloud cluster URL.")
28
+ parser.add_argument("--weaviate_api_key", type=str, required=True, help="API key for Weaviate.")
29
+ parser.add_argument("--index_name", type=str, required=True, help="Index name in Weaviate.")
30
+ parser.add_argument(
31
+ "--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model for Weaviate."
32
+ )
33
+
34
+ # LLM Configuration
35
+ parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model to use.")
36
+ parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM.")
37
+
38
+ # Optimization Method
39
+ parser.add_argument(
40
+ "--optimizer",
41
+ type=str,
42
+ choices=["bootstrap", "bayesian"],
43
+ default="bootstrap",
44
+ help="Choose the optimization method.",
45
+ )
46
+
47
+ return parser.parse_args()
48
+
49
+
50
def main():
    """Run the DSPy RAG optimization pipeline end to end.

    Loads the dataset, builds train/dev/test splits, connects to Weaviate,
    configures the LLM, evaluates the unoptimized RAG program, optimizes it
    with the selected teleprompter, and evaluates the optimized program.
    """
    args = parse_args()

    # Load dataset
    dataset = load_dataset(args.dataset_name, split=f"train[:{args.dataset_size}]")
    questions = dataset["question"]

    # Create DSPy datasets (20 train / 10 dev / remainder test)
    trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
    devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
    testset = [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]  # noqa: F841 — parity with sibling pipelines; not consumed here

    # Initialize embeddings
    model_kwargs = {"device": "cpu", "trust_remote_code": True}
    encode_kwargs = {"task": "retrieval.query", "prompt_name": "retrieval.query"}
    embeddings = HuggingFaceEmbeddings(
        model_name=args.embedding_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    # Connect to Weaviate
    weaviate_client = weaviate.connect_to_weaviate_cloud(
        cluster_url=args.weaviate_url,
        auth_credentials=Auth.api_key(args.weaviate_api_key),
    )
    # NOTE(review): the vector store is constructed only for its side effects;
    # DSPyRAG() is built with no arguments below, so it presumably locates the
    # store via shared/global state — confirm against the DSPyRAG implementation.
    WeaviateVectorStore(
        index_name=args.index_name,
        embedding=embeddings,
        client=weaviate_client,
        text_key="text",
    )

    # Configure LLM
    llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=120)
    dspy.configure(lm=llm)

    # Fixed: the metric must be passed as a (gold, pred, trace) -> float
    # callable. The original called `DSPyEvaluator.llm_metric()` with no
    # arguments, raising a TypeError before evaluation could start (the
    # baseline pipeline correctly passes `evaluator.llm_metric` uncalled).
    metric = DSPyEvaluator().llm_metric

    # Evaluate before optimization
    evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(DSPyRAG(), metric=metric)

    # Select Optimizer
    if args.optimizer == "bootstrap":
        optimizer = BootstrapFewShotWithRandomSearch(
            metric=metric,
            max_bootstrapped_demos=4,
            max_labeled_demos=4,
            max_rounds=1,
            num_candidate_programs=2,
            num_threads=2,
        )
    else:
        optimizer = BayesianSignatureOptimizer(
            task_model=dspy.settings.lm,
            metric=metric,
            prompt_model=dspy.settings.lm,
            n=5,
            verbose=False,
        )

    # Compile optimized RAG. Fixed: neither teleprompter's `compile` accepts a
    # `testset` keyword, so only the trainset is passed.
    optimized_compiled_rag = optimizer.compile(DSPyRAG(), trainset=trainset)

    # Evaluate optimized RAG
    evaluate = Evaluate(metric=metric, devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(optimized_compiled_rag)
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_optimization_rag.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import dspy
4
+ from datasets import load_dataset
5
+ from dspy.evaluate.evaluate import Evaluate
6
+ from dspy.teleprompt import BootstrapFewShot
7
+
8
+ from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
9
+ from rag_pipelines.dspy.dspy_rag import DSPyRAG
10
+ from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
11
+
12
+
13
def main(args):
    """Runs the DSPy RAG optimization pipeline.

    This function:
    1. Loads the earnings calls dataset.
    2. Splits the dataset into training, development, and test sets.
    3. Initializes a Weaviate vector database and an LLM.
    4. Evaluates an unoptimized RAG pipeline.
    5. Optimizes the RAG pipeline using BootstrapFewShot.
    6. Evaluates the optimized RAG pipeline.

    Args:
        args (argparse.Namespace): Command-line arguments for configuring the pipeline.
    """
    # Load the dataset (Earnings Calls QA dataset)
    earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
    questions = earnings_calls_data["question"]

    # Split the dataset into training (20), development (10), and test sets
    trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
    devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
    [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]  # Test set (not used in this script)

    # Initialize Weaviate VectorDB for storing and retrieving embeddings
    weaviate_db = WeaviateVectorDB(
        cluster_url=args.cluster_url,  # URL of the Weaviate cluster
        api_key=args.api_key,  # API key for authentication
        index_name=args.index_name,  # Name of the Weaviate index
        model_name=args.embedding_model,  # Embedding model to use for vector storage
    )

    # Initialize LLM with DSPy
    llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
    dspy.configure(lm=llm)  # Set DSPy's global configuration for LLM usage

    # Initialize the unoptimized RAG pipeline
    uncompiled_rag = DSPyRAG(weaviate_db)

    # Fixed: pass the metric as a (gold, pred, trace) -> float callable. The
    # original called `DSPyEvaluator.llm_metric()` with no arguments, which
    # raises a TypeError (the baseline pipeline passes `evaluator.llm_metric`
    # uncalled).
    metric = DSPyEvaluator().llm_metric

    # Evaluate the unoptimized RAG pipeline using the development set
    evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(uncompiled_rag, metric=metric)

    # Optimize the RAG pipeline using BootstrapFewShot
    optimizer = BootstrapFewShot(metric=metric)

    # Compile an optimized version of the RAG model using the training set
    optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)

    # Evaluate the optimized RAG pipeline
    evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(optimized_compiled_rag)
64
+
65
+
66
+ if __name__ == "__main__":
67
+ """
68
+ Parses command-line arguments and runs the main function.
69
+ """
70
+
71
+ parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
72
+
73
+ # Weaviate parameters (for vector storage and retrieval)
74
+ parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
75
+ parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
76
+ parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
77
+ parser.add_argument(
78
+ "--embedding_model",
79
+ type=str,
80
+ default="jinaai/jina-embeddings-v3",
81
+ help="Embedding model used for document vectorization.",
82
+ )
83
+
84
+ # LLM parameters (for DSPy-based language model inference)
85
+ parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name.")
86
+ parser.add_argument("--llm_api_key", type=str, required=True, help="API key for accessing the LLM service.")
87
+ parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM API calls.")
88
+
89
+ # Parse command-line arguments and execute the pipeline
90
+ args = parser.parse_args()
91
+ main(args)
src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_random_search_optimization_rag.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+
4
+ import dspy
5
+ from datasets import load_dataset
6
+ from dspy.evaluate.evaluate import Evaluate
7
+ from dspy.teleprompt import BootstrapFewShotWithRandomSearch
8
+
9
+ from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
10
+ from rag_pipelines.dspy.dspy_rag import DSPyRAG
11
+ from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
12
+
13
+
14
def main(args):
    """Run the DSPy RAG optimization pipeline.

    Loads the earnings-calls dataset, initializes a Weaviate vector database
    and an LLM, evaluates the unoptimized RAG pipeline, optimizes it with
    BootstrapFewShotWithRandomSearch, and evaluates the optimized pipeline.

    Args:
        args (argparse.Namespace): Command-line arguments for configuring the pipeline.
    """
    # Load dataset (Earnings Calls QA)
    earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
    questions = earnings_calls_data["question"]

    # Split dataset into training, development, and test sets
    trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
    devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
    [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]  # Test set (not used here)

    # Initialize Weaviate Vector Database
    weaviate_db = WeaviateVectorDB(
        cluster_url=args.cluster_url,
        api_key=args.api_key,
        index_name=args.index_name,
        model_name=args.embedding_model,
    )

    # Initialize the LLM
    llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
    dspy.configure(lm=llm)  # Set DSPy's global LLM configuration

    # Initialize the unoptimized RAG pipeline
    uncompiled_rag = DSPyRAG(weaviate_db)

    # Fixed: pass the metric as a (gold, pred, trace) -> float callable rather
    # than calling `DSPyEvaluator.llm_metric()` with no arguments (TypeError).
    metric = DSPyEvaluator().llm_metric

    # Evaluate the unoptimized RAG model
    evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(uncompiled_rag, metric=metric)

    # Optimize RAG using BootstrapFewShotWithRandomSearch.
    # Fixed: the original passed `num_threads=` twice in this call, which is a
    # SyntaxError (duplicate keyword argument); it is passed exactly once here.
    optimizer = BootstrapFewShotWithRandomSearch(
        metric=metric,
        max_bootstrapped_demos=args.max_bootstrapped_demos,
        max_labeled_demos=args.max_labeled_demos,
        max_rounds=args.max_rounds,
        num_candidate_programs=args.num_candidate_programs,
        num_threads=args.num_threads,
    )

    # Compile an optimized version of the RAG model
    optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)

    # Evaluate the optimized RAG model
    evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(optimized_compiled_rag)
69
+
70
+
71
+ if __name__ == "__main__":
72
+ """
73
+ Parses command-line arguments and runs the main function.
74
+ """
75
+
76
+ parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
77
+
78
+ # Weaviate parameters
79
+ parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
80
+ parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
81
+ parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
82
+ parser.add_argument(
83
+ "--embedding_model",
84
+ type=str,
85
+ default="jinaai/jina-embeddings-v3",
86
+ help="Embedding model to use for vector retrieval.",
87
+ )
88
+
89
+ # LLM parameters
90
+ parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name.")
91
+ parser.add_argument("--llm_api_key", type=str, required=True, help="API key for accessing the LLM.")
92
+ parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM calls.")
93
+
94
+ # Optimization parameters
95
+ parser.add_argument("--max_bootstrapped_demos", type=int, default=4, help="Max bootstrapped demonstrations.")
96
+ parser.add_argument("--max_labeled_demos", type=int, default=4, help="Max labeled demonstrations.")
97
+ parser.add_argument("--max_rounds", type=int, default=1, help="Max optimization rounds.")
98
+ parser.add_argument("--num_candidate_programs", type=int, default=2, help="Number of candidate programs.")
99
+ parser.add_argument("--num_threads", type=int, default=2, help="Number of threads for optimization.")
100
+
101
+ # Parse arguments and run the main function
102
+ args = parser.parse_args()
103
+ main(args)
src/rag_pipelines/pipelines/dspy_copro_instruction_signature_optimization_rag.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import dspy
4
+ import weaviate
5
+ from datasets import load_dataset
6
+ from dspy.evaluate.evaluate import Evaluate
7
+ from dspy.teleprompt import COPRO
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from weaviate.classes.init import Auth
10
+
11
+ from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
12
+ from rag_pipelines.dspy.dspy_rag import DSPyRAG
13
+ from rag_pipelines.vectordb.weaviate import WeaviateVectorStore
14
+
15
+
16
+ def parse_args():
17
+ """Parse command-line arguments for the DSPy RAG pipeline with Weaviate and LLM evaluation.
18
+
19
+ Returns:
20
+ argparse.Namespace: The parsed command-line arguments.
21
+ """
22
+ parser = argparse.ArgumentParser(description="Run DSPy RAG pipeline with Weaviate and LLM evaluation.")
23
+
24
+ # Dataset Arguments
25
+ parser.add_argument(
26
+ "--dataset_name", type=str, default="lamini/earnings-calls-qa", help="Name of the dataset to use."
27
+ )
28
+ parser.add_argument("--dataset_size", type=int, default=50, help="Number of examples to load from the dataset.")
29
+
30
+ # Weaviate Configuration
31
+ parser.add_argument("--weaviate_url", type=str, required=True, help="Weaviate cloud cluster URL.")
32
+ parser.add_argument("--weaviate_api_key", type=str, required=True, help="API key for Weaviate.")
33
+ parser.add_argument("--index_name", type=str, required=True, help="Index name in Weaviate.")
34
+ parser.add_argument(
35
+ "--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model for Weaviate."
36
+ )
37
+
38
+ # LLM Configuration
39
+ parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model to use.")
40
+ parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM.")
41
+
42
+ return parser.parse_args()
43
+
44
+
45
def main():
    """Run the DSPy RAG pipeline with COPRO optimization.

    Orchestrates dataset loading, embedding initialization, the Weaviate
    connection, LLM configuration, evaluation of the unoptimized RAG program,
    COPRO optimization, and evaluation of the optimized program.
    """
    # Parse command-line arguments
    args = parse_args()

    # Load dataset from Hugging Face and extract questions
    dataset = load_dataset(args.dataset_name, split=f"train[:{args.dataset_size}]")
    questions = dataset["question"]

    # Create DSPy datasets for training and evaluation (20 train / 10 dev / rest test)
    trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
    devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
    [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]  # Test set (not used in this script)

    # Initialize HuggingFace embeddings for retrieval tasks
    model_kwargs = {"device": "cpu", "trust_remote_code": True}
    encode_kwargs = {"task": "retrieval.query", "prompt_name": "retrieval.query"}
    embeddings = HuggingFaceEmbeddings(
        model_name=args.embedding_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    # Connect to Weaviate using the provided URL and API key
    weaviate_client = weaviate.connect_to_weaviate_cloud(
        cluster_url=args.weaviate_url,
        auth_credentials=Auth.api_key(args.weaviate_api_key),
    )

    # NOTE(review): the vector store is constructed only for its side effects;
    # DSPyRAG() is built with no arguments below, so it presumably locates the
    # store via shared/global state — confirm against the DSPyRAG implementation.
    WeaviateVectorStore(
        index_name=args.index_name,
        embedding=embeddings,
        client=weaviate_client,
        text_key="text",
    )

    # Initialize the LLM with the specified model and API key
    llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=120)
    dspy.configure(lm=llm)

    # Fixed: pass the metric as a (gold, pred, trace) -> float callable. The
    # original called `DSPyEvaluator.llm_metric()` with no arguments, which
    # raises a TypeError before evaluation starts.
    metric = DSPyEvaluator().llm_metric

    # Evaluate the initial RAG pipeline
    evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
    evaluate(DSPyRAG(), metric=metric)

    # Optimize the RAG model using COPRO (Collaborative Prompt Optimization)
    optimizer = COPRO(
        prompt_model=dspy.settings.lm,
        metric=metric,
        breadth=3,
        depth=2,
        init_temperature=0.25,
        verbose=False,
    )

    # Compile the optimized RAG model with the training set
    optimized_compiled_rag = optimizer.compile(
        DSPyRAG(),
        trainset=trainset,
        eval_kwargs={"num_threads": 1, "display_progress": True, "display_table": 0},
    )

    # Evaluate the optimized model on the development set
    evaluate = Evaluate(
        metric=metric,
        devset=devset,
        num_threads=1,
        display_progress=True,
        display_table=5,
    )
    evaluate(optimized_compiled_rag)
118
+
119
+
120
+ if __name__ == "__main__":
121
+ main()
src/rag_pipelines/pipelines/dspy_rag.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ import dspy
4
+ import weave
5
+ from dspy import LM, Module
6
+
7
+
8
class DSPyRAGPipeline(weave.Model):
    """A Retrieval-Augmented Generation (RAG) pipeline wrapper built on DSPy and Weave.

    Wraps a DSPy retrieval module and a DSPy language model behind a
    ``weave.Model`` so that predictions are versioned and traced by Weave.
    On construction, the provided LM is installed as the global DSPy LM via
    ``dspy.configure``.

    Attributes:
        llm (LM): The DSPy language model used for generating predictions.
        rag_module (Module): The DSPy module that performs retrieval and
            answer generation when called with a question string.
    """

    llm: LM
    rag_module: Module

    def __init__(self, llm: LM, rag_module: Module) -> None:
        """Initialize the DSPyRAG model.

        Args:
            llm (LM): The language model to be used. Note: this is set as the
                process-wide DSPy LM (``dspy.configure``), which affects any
                other DSPy code running in the same process.
            rag_module (Module): The module to be used for retrieval tasks.
        """
        # weave.Model is Pydantic-based: declared fields must be passed
        # through super().__init__ for validation/tracking.
        super().__init__(llm=llm, rag_module=rag_module)

        self.llm = llm
        self.rag_module = rag_module

        # Install the LM globally so rag_module's internal DSPy calls use it.
        dspy.configure(lm=llm)

    @weave.op()
    def predict(self, input: str) -> dict[str, Any]:
        """Predict the answer to a given question using the RAG approach.

        Args:
            input (str): The question to be answered. (Named ``input`` — this
                shadows the builtin, but renaming would break keyword callers.)

        Returns:
            dict[str, Any]: A dictionary with two keys:
                - "output" (str): The predicted answer to the question.
                - "retrieval_context" (Any): The retrieval context produced by
                  the RAG module (``prediction.retrieval_context``).
        """
        prediction = self.rag_module(input)

        return {"output": prediction.answer, "retrieval_context": prediction.retrieval_context}
src/rag_pipelines/pipelines/dspy_rag_module.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from dspy import ChainOfThought, Module, Prediction
4
+
5
+ from rag_pipelines.evaluation import retrieval
6
+ from rag_pipelines.prompts import GenerateAnswerFromContext
7
+
8
+
9
class RAG(Module):
    """Retrieval-Augmented Generation (RAG) module.

    Given a question, fetches supporting passages through the injected
    retriever and produces an answer with a chain-of-thought generator.
    """

    def __init__(self, retriever: Any):
        """Initialize the RAG module.

        Args:
            retriever (Any): A callable retriever: invoking it with a question
                yields an object exposing a ``passages`` attribute (e.g. a
                Milvus retriever).
        """
        super().__init__()
        self.retrieve = retriever
        self.generate_answer = ChainOfThought(GenerateAnswerFromContext)

    def forward(self, question: str) -> Prediction:
        """Answer a question by retrieving context and generating a response.

        Args:
            question (str): The question to be answered.

        Returns:
            Prediction: Carries ``retrieval_context`` (the raw passage texts)
            and ``answer`` (the generated answer).
        """
        # Fetch supporting passages for the question.
        passages = self.retrieve(question).passages

        # Produce an answer grounded in the retrieved passages.
        generated = self.generate_answer(context=passages, question=question)

        # Surface the plain passage texts alongside the answer.
        passage_texts = [passage.long_text for passage in passages]
        return Prediction(retrieval_context=passage_texts, answer=generated.answer)
src/rag_pipelines/pipelines/rag.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Optional
3
+
4
+ import weave
5
+ from langchain_community.retrievers import PineconeHybridSearchRetriever
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_core.prompts.chat import ChatPromptTemplate
8
+ from langchain_core.runnables import RunnablePassthrough
9
+ from langchain_groq import ChatGroq
10
+ from weave import Model
11
+ from weave.integrations.langchain import WeaveTracer
12
+
13
+ # Disable global tracing explicitly
14
+ os.environ["WEAVE_TRACE_LANGCHAIN"] = "false"
15
+
16
+
17
class RAGPipeline(Model):
    """A hybrid retrieval-augmented generation (RAG) pipeline using Weave for tracing and LangChain components.

    This pipeline integrates a retriever, prompt template, and language model (LLM) to implement a retrieval-augmented
    generation system, where the LLM generates answers based on both retrieved documents and a prompt template.
    Weave is used for tracing to monitor the pipeline's execution.

    Attributes:
        retriever: The retrieval model used to fetch relevant documents based on a query.
        prompt: The prompt template to generate questions for the LLM.
        llm: The language model used to generate responses.
        tracer: The tracer used to record the execution details with Weave.
        tracing_project_name: The name of the Weave project for tracing.
        weave_params: Extra keyword arguments forwarded to ``weave.init``.
    """

    retriever: Optional[PineconeHybridSearchRetriever] = None
    prompt: Optional[ChatPromptTemplate] = None
    llm: Optional[ChatGroq] = None
    tracing_project_name: str
    weave_params: dict[str, Any]
    tracer: Optional[WeaveTracer] = None

    def __init__(self, retriever, prompt, llm, tracing_project_name="hybrid_rag", weave_params=None):
        """Initialize the HybridRAGPipeline.

        This constructor sets up the retriever, prompt, LLM, and integrates Weave tracing.

        Args:
            retriever: The retrieval model used to fetch documents for the RAG pipeline.
            prompt: The prompt template used to create questions for the LLM.
            llm: The language model used for response generation based on retrieved documents and prompt.
            tracing_project_name (str): The name of the Weave project for tracing. Defaults to "hybrid_rag".
            weave_params (dict): Additional parameters for initializing Weave. This can include configuration
                details or authentication settings for the Weave service. Defaults to no extra parameters.
        """
        # BUG FIX: normalize weave_params BEFORE calling super().__init__.
        # The field is declared as dict[str, Any] (not Optional), so passing
        # None through the Pydantic-backed weave.Model constructor would fail
        # validation on the default call path.
        if weave_params is None:
            weave_params = {}

        super().__init__(
            retriever=retriever,
            prompt=prompt,
            llm=llm,
            tracing_project_name=tracing_project_name,
            weave_params=weave_params,
        )

        self.retriever = retriever
        self.prompt = prompt
        self.llm = llm
        self.tracing_project_name = tracing_project_name

        # ``_initialize_weave()`` with an empty dict is equivalent to calling
        # it with no arguments, so a single unconditional call suffices.
        self._initialize_weave(**weave_params)

    def _initialize_weave(self, **weave_params):
        """Initialize Weave with the specified tracing project name.

        This method sets up the Weave environment and creates an instance of the WeaveTracer.
        The tracer records the execution of each step in the RAG pipeline for monitoring and
        debugging purposes.

        Args:
            **weave_params: Extra keyword arguments forwarded to ``weave.init``.
        """
        # Initialize the Weave project
        weave.init(self.tracing_project_name, **weave_params)
        # Set up the tracer for tracking pipeline execution
        self.tracer = WeaveTracer()

    @weave.op()
    def predict(self, question: str) -> str:
        """Execute the Hybrid RAG pipeline with the given question.

        This method orchestrates the entire RAG pipeline. It first retrieves documents using the
        retriever, formats them, fills the prompt template, and then produces the final response
        with the LLM. The process is traced using Weave for debugging and monitoring.

        Args:
            question (str): The input question to be answered by the pipeline.

        Returns:
            str: The answer generated by the LLM based on the retrieved documents and the question prompt.
        """
        # Configuration for trace callbacks to record the execution process
        config = {"callbacks": [self.tracer]}

        # Set up the RAG pipeline chain with document retrieval, formatting, prompting, LLM, and output parsing
        rag_chain = (
            {
                "context": self.retriever | self.format_docs,
                "question": RunnablePassthrough(),
            }
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

        # Invoke the pipeline with the specified question and configuration
        return rag_chain.invoke(question, config=config)

    def format_docs(self, docs):
        """Format retrieved documents into a string for input to the LLM.

        The documents are formatted with information such as filing date, accession number,
        summary, and image descriptions. This string will be passed as the context for the
        LLM to generate a response.

        Args:
            docs (list): A list of document objects retrieved for the input question. Each
                document is expected to expose ``page_content`` and a ``metadata`` mapping
                containing "filing_date", "accession_no", "summary", and "image_descriptions".

        Returns:
            str: A formatted string of document contents.
        """
        # Accumulate sections in a list and join once — avoids repeated
        # string concatenation over potentially large document sets.
        sections = []
        for doc in docs:
            date = doc.metadata["filing_date"]
            accession_no = doc.metadata["accession_no"]
            summary = doc.metadata["summary"]
            image_descriptions = doc.metadata["image_descriptions"]
            sections.append(
                f"""# Report {accession_no} filed on {date}:\n\n## An excerpt from the report"""
                f"""\n\n{doc.page_content}\n\n"""
            )
            if len(image_descriptions) > 0:
                sections.append(f"""## Image descriptions\n\n{image_descriptions}\n\n""")
            sections.append(
                f"""## Summary of the report\n\nHere's a summary of the report along with the some """
                f"""important keywords and phrases present in the report:\n\n{summary}\n\n"""
            )

        return "".join(sections)