Upload 107 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +118 -0
- pyproject.toml +202 -0
- src/rag_pipelines/__init__.py +0 -0
- src/rag_pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__init__.py +6 -0
- src/rag_pipelines/embeddings/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/dense.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/sparse_fastembed_qdrant.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/sparse_milvus.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/__pycache__/sparse_pinecone_text.cpython-310.pyc +0 -0
- src/rag_pipelines/embeddings/dense.py +85 -0
- src/rag_pipelines/embeddings/sparse_fastembed_qdrant.py +57 -0
- src/rag_pipelines/embeddings/sparse_milvus.py +67 -0
- src/rag_pipelines/embeddings/sparse_pinecone_text.py +58 -0
- src/rag_pipelines/evaluation/__init__.py +19 -0
- src/rag_pipelines/evaluation/evaluator.py +54 -0
- src/rag_pipelines/evaluation/response/__init__.py +0 -0
- src/rag_pipelines/evaluation/response/answer_relevancy.py +152 -0
- src/rag_pipelines/evaluation/response/faithfulness.py +132 -0
- src/rag_pipelines/evaluation/response/hallucination.py +127 -0
- src/rag_pipelines/evaluation/response/phoenix_hallucination.py +107 -0
- src/rag_pipelines/evaluation/response/summarization.py +158 -0
- src/rag_pipelines/evaluation/retrieval/__init__.py +0 -0
- src/rag_pipelines/evaluation/retrieval/contextual_precision.py +160 -0
- src/rag_pipelines/evaluation/retrieval/contextual_recall.py +127 -0
- src/rag_pipelines/evaluation/retrieval/contextual_relevancy.py +125 -0
- src/rag_pipelines/llms/__init__.py +3 -0
- src/rag_pipelines/llms/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/llms/__pycache__/groq.cpython-310.pyc +0 -0
- src/rag_pipelines/llms/groq.py +99 -0
- src/rag_pipelines/pipelines/__init__.py +3 -0
- src/rag_pipelines/pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
- src/rag_pipelines/pipelines/__pycache__/self_rag.cpython-310.pyc +0 -0
- src/rag_pipelines/pipelines/__pycache__/self_rag_graph_state.cpython-310.pyc +0 -0
- src/rag_pipelines/pipelines/adaptive_rag.py +0 -0
- src/rag_pipelines/pipelines/adaptive_rag_graph_state.py +18 -0
- src/rag_pipelines/pipelines/crag.py +172 -0
- src/rag_pipelines/pipelines/crag_graph_state.py +17 -0
- src/rag_pipelines/pipelines/dspy/dspy_baseline_rag.py +46 -0
- src/rag_pipelines/pipelines/dspy/dspy_bayesian_signature_optimization_rag.py +124 -0
- src/rag_pipelines/pipelines/dspy/dspy_bootstrap_few_shot_optimization_rag.py +60 -0
- src/rag_pipelines/pipelines/dspy/dspy_copro_instruction_signature_optimization_rag.py +150 -0
- src/rag_pipelines/pipelines/dspy_baseline_rag.py +81 -0
- src/rag_pipelines/pipelines/dspy_bayesian_signature_optimization_rag.py +119 -0
- src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_optimization_rag.py +91 -0
- src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_random_search_optimization_rag.py +103 -0
- src/rag_pipelines/pipelines/dspy_copro_instruction_signature_optimization_rag.py +121 -0
- src/rag_pipelines/pipelines/dspy_rag.py +47 -0
- src/rag_pipelines/pipelines/dspy_rag_module.py +39 -0
- src/rag_pipelines/pipelines/rag.py +146 -0
.gitignore
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Local run files
|
| 2 |
+
qa.db
|
| 3 |
+
**/qa.db
|
| 4 |
+
**/*qa*.db
|
| 5 |
+
**/test-reports
|
| 6 |
+
|
| 7 |
+
# Byte-compiled / optimized / DLL files
|
| 8 |
+
__pycache__/
|
| 9 |
+
/pycache/*
|
| 10 |
+
**/pycache/*
|
| 11 |
+
*/*/pycache/*
|
| 12 |
+
*/*/*/pycache/*
|
| 13 |
+
*/*/*/*/pycache/*
|
| 14 |
+
*.py[cod]
|
| 15 |
+
*$py.class
|
| 16 |
+
|
| 17 |
+
# C extensions
|
| 18 |
+
*.so
|
| 19 |
+
|
| 20 |
+
# Distribution / packaging
|
| 21 |
+
.Python
|
| 22 |
+
build/
|
| 23 |
+
develop-eggs/
|
| 24 |
+
dist/
|
| 25 |
+
downloads/
|
| 26 |
+
eggs/
|
| 27 |
+
.eggs/
|
| 28 |
+
lib/
|
| 29 |
+
lib64/
|
| 30 |
+
parts/
|
| 31 |
+
sdist/
|
| 32 |
+
var/
|
| 33 |
+
wheels/
|
| 34 |
+
pip-wheel-metadata/
|
| 35 |
+
share/python-wheels/
|
| 36 |
+
*.egg-info/
|
| 37 |
+
.installed.cfg
|
| 38 |
+
*.egg
|
| 39 |
+
MANIFEST
|
| 40 |
+
|
| 41 |
+
# Unit test / coverage reports
|
| 42 |
+
htmlcov/
|
| 43 |
+
.tox/
|
| 44 |
+
.nox/
|
| 45 |
+
.coverage
|
| 46 |
+
.coverage.*
|
| 47 |
+
.cache
|
| 48 |
+
nosetests.xml
|
| 49 |
+
coverage.xml
|
| 50 |
+
*.cover
|
| 51 |
+
*.py,cover
|
| 52 |
+
.hypothesis/
|
| 53 |
+
.pytest_cache/
|
| 54 |
+
|
| 55 |
+
# Translations
|
| 56 |
+
*.mo
|
| 57 |
+
*.pot
|
| 58 |
+
|
| 59 |
+
# Django stuff:
|
| 60 |
+
*.log
|
| 61 |
+
local_settings.py
|
| 62 |
+
db.sqlite3
|
| 63 |
+
db.sqlite3-journal
|
| 64 |
+
|
| 65 |
+
# Flask stuff:
|
| 66 |
+
instance/
|
| 67 |
+
.webassets-cache
|
| 68 |
+
|
| 69 |
+
# Scrapy stuff:
|
| 70 |
+
.scrapy
|
| 71 |
+
|
| 72 |
+
# PyBuilder
|
| 73 |
+
target/
|
| 74 |
+
|
| 75 |
+
# Jupyter Notebook
|
| 76 |
+
.ipynb_checkpoints
|
| 77 |
+
|
| 78 |
+
# IPython
|
| 79 |
+
profile_default/
|
| 80 |
+
ipython_config.py
|
| 81 |
+
|
| 82 |
+
# pyenv
|
| 83 |
+
.python-version
|
| 84 |
+
|
| 85 |
+
# pyflow
|
| 86 |
+
__pypackages__/
|
| 87 |
+
|
| 88 |
+
# Environments
|
| 89 |
+
.env
|
| 90 |
+
.venv
|
| 91 |
+
env/
|
| 92 |
+
venv/
|
| 93 |
+
ENV/
|
| 94 |
+
env.bak/
|
| 95 |
+
venv.bak/
|
| 96 |
+
|
| 97 |
+
# mkdocs documentation
|
| 98 |
+
/site
|
| 99 |
+
|
| 100 |
+
# mypy
|
| 101 |
+
.mypy_cache/
|
| 102 |
+
.dmypy.json
|
| 103 |
+
dmypy.json
|
| 104 |
+
|
| 105 |
+
# Pyre type checker
|
| 106 |
+
.pyre/
|
| 107 |
+
|
| 108 |
+
# PyCharm
|
| 109 |
+
.idea
|
| 110 |
+
|
| 111 |
+
# VSCode
|
| 112 |
+
.vscode
|
| 113 |
+
|
| 114 |
+
# http cache (requests-cache)
|
| 115 |
+
**/http_cache.sqlite
|
| 116 |
+
|
| 117 |
+
# ruff
|
| 118 |
+
.ruff_cache
|
pyproject.toml
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling", "hatch-vcs"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "rag-pipelines"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = 'Advanced Retrieval Augmented Generation Pipelines'
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.9"
|
| 11 |
+
license = "MIT"
|
| 12 |
+
keywords = []
|
| 13 |
+
authors = [
|
| 14 |
+
{ name = "Ashwin Mathur", email = "" },
|
| 15 |
+
{ name = "Varun Mathur", email = "" },
|
| 16 |
+
]
|
| 17 |
+
classifiers = [
|
| 18 |
+
"License :: OSI Approved :: MIT License",
|
| 19 |
+
"Development Status :: 4 - Beta",
|
| 20 |
+
"Programming Language :: Python",
|
| 21 |
+
"Programming Language :: Python :: 3.9",
|
| 22 |
+
"Programming Language :: Python :: 3.10",
|
| 23 |
+
"Programming Language :: Python :: 3.11",
|
| 24 |
+
"Programming Language :: Python :: Implementation :: CPython",
|
| 25 |
+
"Programming Language :: Python :: Implementation :: PyPy",
|
| 26 |
+
]
|
| 27 |
+
dependencies = [
|
| 28 |
+
"dataloaders @ git+https://github.com/avnlp/dataloaders.git",
|
| 29 |
+
"langchain-core",
|
| 30 |
+
"langgraph",
|
| 31 |
+
"langchain-text-splitters",
|
| 32 |
+
"langchain-experimental",
|
| 33 |
+
"langchain-huggingface",
|
| 34 |
+
"langchain-groq",
|
| 35 |
+
"langchain_milvus",
|
| 36 |
+
"langchain-qdrant",
|
| 37 |
+
"langchain-pinecone",
|
| 38 |
+
"langchain-voyageai",
|
| 39 |
+
"spladerunner",
|
| 40 |
+
"haystack-ai",
|
| 41 |
+
"weave",
|
| 42 |
+
"edgartools",
|
| 43 |
+
"fastembed",
|
| 44 |
+
"pinecone-text[splade]",
|
| 45 |
+
"unstructured[pdf]",
|
| 46 |
+
"deepeval",
|
| 47 |
+
"arize-phoenix",
|
| 48 |
+
"dspy",
|
| 49 |
+
"dspy-ai[milvus]",
|
| 50 |
+
"optimum[onnxruntime]",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
[project.optional-dependencies]
|
| 54 |
+
dev = ["pytest"]
|
| 55 |
+
|
| 56 |
+
[project.urls]
|
| 57 |
+
Documentation = "https://github.com/avnlp/rag-pipelines#readme"
|
| 58 |
+
Issues = "https://github.com/avnlp/rag-pipelines/issues"
|
| 59 |
+
Source = "https://github.com/avnlp/rag-pipelines"
|
| 60 |
+
|
| 61 |
+
[tool.hatch.metadata]
|
| 62 |
+
allow-direct-references = true
|
| 63 |
+
|
| 64 |
+
[tool.hatch.build.targets.wheel]
|
| 65 |
+
packages = ["src/rag_pipelines"]
|
| 66 |
+
|
| 67 |
+
[tool.hatch.envs.default]
|
| 68 |
+
installer = "uv"
|
| 69 |
+
dependencies = [
|
| 70 |
+
"coverage[toml]>=6.5",
|
| 71 |
+
"pytest",
|
| 72 |
+
"pytest-rerunfailures",
|
| 73 |
+
"pytest-mock",
|
| 74 |
+
]
|
| 75 |
+
|
| 76 |
+
[tool.hatch.envs.default.scripts]
|
| 77 |
+
test = "pytest -vv {args:tests}"
|
| 78 |
+
test-cov = "coverage run -m pytest {args:tests}"
|
| 79 |
+
test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x"
|
| 80 |
+
cov-report = ["- coverage combine", "coverage report"]
|
| 81 |
+
cov = ["test-cov", "cov-report"]
|
| 82 |
+
cov-retry = ["test-cov-retry", "cov-report"]
|
| 83 |
+
|
| 84 |
+
[[tool.hatch.envs.test.matrix]]
|
| 85 |
+
python = ["39", "310", "311"]
|
| 86 |
+
|
| 87 |
+
[tool.hatch.envs.lint]
|
| 88 |
+
installer = "uv"
|
| 89 |
+
detached = true
|
| 90 |
+
dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
|
| 91 |
+
|
| 92 |
+
[tool.hatch.envs.lint.scripts]
|
| 93 |
+
typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
|
| 94 |
+
style = ["ruff check {args:}", "black --check --diff {args:.}"]
|
| 95 |
+
fmt = ["black {args:.}", "ruff check --fix --unsafe-fixes {args:}", "style"]
|
| 96 |
+
all = ["style", "typing"]
|
| 97 |
+
|
| 98 |
+
[tool.coverage.run]
|
| 99 |
+
source = ["rag_pipelines"]
|
| 100 |
+
branch = true
|
| 101 |
+
parallel = true
|
| 102 |
+
|
| 103 |
+
[tool.coverage.report]
|
| 104 |
+
omit = ["*/tests/*", "*/__init__.py"]
|
| 105 |
+
show_missing = true
|
| 106 |
+
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
|
| 107 |
+
|
| 108 |
+
[tool.ruff]
|
| 109 |
+
target-version = "py39"
|
| 110 |
+
line-length = 120
|
| 111 |
+
|
| 112 |
+
[tool.ruff.lint]
|
| 113 |
+
select = [
|
| 114 |
+
"A",
|
| 115 |
+
"ARG",
|
| 116 |
+
"B",
|
| 117 |
+
"C",
|
| 118 |
+
"D",
|
| 119 |
+
"D401",
|
| 120 |
+
"DTZ",
|
| 121 |
+
"E",
|
| 122 |
+
"EM",
|
| 123 |
+
"F",
|
| 124 |
+
"I",
|
| 125 |
+
"ICN",
|
| 126 |
+
"ISC",
|
| 127 |
+
"N",
|
| 128 |
+
"PLC",
|
| 129 |
+
"PLE",
|
| 130 |
+
"PLR",
|
| 131 |
+
"PLW",
|
| 132 |
+
"Q",
|
| 133 |
+
"RUF",
|
| 134 |
+
"S",
|
| 135 |
+
"T",
|
| 136 |
+
"TID",
|
| 137 |
+
"UP",
|
| 138 |
+
"W",
|
| 139 |
+
"YTT",
|
| 140 |
+
]
|
| 141 |
+
ignore = [
|
| 142 |
+
# Allow non-abstract empty methods in abstract base classes
|
| 143 |
+
"B027",
|
| 144 |
+
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
| 145 |
+
"FBT003",
|
| 146 |
+
# Ignore use of exec (S102) and checks for possible hardcoded passwords
|
| 147 |
+
"S102",
|
| 148 |
+
"S105",
|
| 149 |
+
"S106",
|
| 150 |
+
"S107",
|
| 151 |
+
# Ignore complexity
|
| 152 |
+
"C901",
|
| 153 |
+
"PLR0911",
|
| 154 |
+
"PLR0912",
|
| 155 |
+
"PLR0913",
|
| 156 |
+
"PLR0915",
|
| 157 |
+
# Allow print statements
|
| 158 |
+
"T201",
|
| 159 |
+
# Ignore missing module docstrings
|
| 160 |
+
"D100",
|
| 161 |
+
"D104",
|
| 162 |
+
# Ignore Line too long
|
| 163 |
+
"E501",
|
| 164 |
+
# Ignore builtin argument shadowing
|
| 165 |
+
"A002",
|
| 166 |
+
# Ignore builtin module shadowing
|
| 167 |
+
"A005",
|
| 168 |
+
# Ignore Function calls in argument defaults
|
| 169 |
+
"B008",
|
| 170 |
+
"ARG002",
|
| 171 |
+
"ARG005",
|
| 172 |
+
]
|
| 173 |
+
unfixable = [
|
| 174 |
+
# Don't touch unused imports
|
| 175 |
+
"F401",
|
| 176 |
+
]
|
| 177 |
+
|
| 178 |
+
[tool.ruff.lint.pydocstyle]
|
| 179 |
+
convention = "google"
|
| 180 |
+
|
| 181 |
+
[tool.ruff.lint.isort]
|
| 182 |
+
known-first-party = ["rag_pipelines"]
|
| 183 |
+
|
| 184 |
+
[tool.ruff.lint.flake8-tidy-imports]
|
| 185 |
+
ban-relative-imports = "parents"
|
| 186 |
+
|
| 187 |
+
[tool.ruff.lint.per-file-ignores]
|
| 188 |
+
# Tests can use magic values, assertions, and relative imports
|
| 189 |
+
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
| 190 |
+
|
| 191 |
+
[tool.pytest.ini_options]
|
| 192 |
+
minversion = "6.0"
|
| 193 |
+
addopts = "--strict-markers"
|
| 194 |
+
markers = ["integration: integration tests"]
|
| 195 |
+
log_cli = true
|
| 196 |
+
|
| 197 |
+
[tool.black]
|
| 198 |
+
line-length = 120
|
| 199 |
+
|
| 200 |
+
[[tool.mypy.overrides]]
|
| 201 |
+
module = ["rag_pipelines.*", "pytest.*", "numpy.*"]
|
| 202 |
+
ignore_missing_imports = true
|
src/rag_pipelines/__init__.py
ADDED
|
File without changes
|
src/rag_pipelines/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
src/rag_pipelines/embeddings/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.embeddings.dense import DenseEmbeddings
|
| 2 |
+
from rag_pipelines.embeddings.sparse_fastembed_qdrant import SparseEmbeddings
|
| 3 |
+
from rag_pipelines.embeddings.sparse_milvus import SparseEmbeddingsMilvus
|
| 4 |
+
from rag_pipelines.embeddings.sparse_pinecone_text import SparseEmbeddingsSplade
|
| 5 |
+
|
| 6 |
+
__all__ = ["DenseEmbeddings", "SparseEmbeddings", "SparseEmbeddingsMilvus", "SparseEmbeddingsSplade"]
|
src/rag_pipelines/embeddings/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (552 Bytes). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/dense.cpython-310.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/sparse_fastembed_qdrant.cpython-310.pyc
ADDED
|
Binary file (2.73 kB). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/sparse_milvus.cpython-310.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
src/rag_pipelines/embeddings/__pycache__/sparse_pinecone_text.cpython-310.pyc
ADDED
|
Binary file (2.61 kB). View file
|
|
|
src/rag_pipelines/embeddings/dense.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DenseEmbeddings(weave.Model):
    """Generate dense embeddings for documents and queries using a SentenceTransformer model.

    This class wraps LangChain's `HuggingFaceEmbeddings` to compute dense
    embedding vectors for input text.

    Attributes:
        model_name (str): The name of the pre-trained embedding model to use.
        model_kwargs (Optional[dict[str, Any]]): Keyword arguments passed to the model constructor.
        encode_kwargs (Optional[dict[str, Any]]): Keyword arguments passed to the encode call.
        show_progress (bool): Whether to display a progress bar while embedding.
        embedding_model (Optional[HuggingFaceEmbeddings]): The initialized embeddings backend.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]]
    encode_kwargs: Optional[dict[str, Any]]
    show_progress: bool
    embedding_model: Optional[HuggingFaceEmbeddings] = None

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs: Optional[dict[str, Any]] = None,
        encode_kwargs: Optional[dict[str, Any]] = None,
        show_progress: bool = True,
    ):
        """Initialize the DenseEmbeddings class with the specified model and configurations.

        Args:
            model_name (str): The name of the pre-trained embedding model.
                Defaults to "sentence-transformers/all-MiniLM-L6-v2".
            model_kwargs (Optional[dict[str, Any]]): Model constructor parameters.
                Defaults to ``{"device": "cpu"}`` when None.
            encode_kwargs (Optional[dict[str, Any]]): Encoding parameters.
                Defaults to ``{"normalize_embeddings": True}`` when None.
            show_progress (bool): Whether to display progress during embedding. Defaults to True.
        """
        # Normalize the optional kwargs once, before the pydantic base stores the fields.
        if encode_kwargs is None:
            encode_kwargs = {"normalize_embeddings": True}
        if model_kwargs is None:
            model_kwargs = {"device": "cpu"}
        super().__init__(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
            show_progress=show_progress,
        )

        # NOTE: super().__init__ already stored the fields above; the previous
        # revision re-assigned them here with unreachable `... if ... is not None
        # else {}` fallbacks (both kwargs are guaranteed non-None at this point).
        # That dead code has been removed.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.model_name,
            model_kwargs=self.model_kwargs,
            encode_kwargs=self.encode_kwargs,
            show_progress=show_progress,
        )

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts and return their embeddings.

        Args:
            texts (list[str]): A list of texts to embed.

        Returns:
            list[list[float]]: One embedding vector per input text.
        """
        return self.embedding_model.embed_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query text and return its embedding.

        Args:
            text (str): The query text to be embedded.

        Returns:
            list[float]: The embedding vector for the query text.
        """
        return self.embedding_model.embed_query(text)
|
src/rag_pipelines/embeddings/sparse_fastembed_qdrant.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from langchain_qdrant.fastembed_sparse import FastEmbedSparse
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SparseEmbeddings(weave.Model):
    """Generate sparse embeddings for documents and queries using the FastEmbedSparse model.

    Attributes:
        model_name (str): The name of the sparse embedding model to use.
        model_kwargs (Optional[dict[str, Any]]): Additional configuration parameters for the model.
        sparse_embedding_model (Optional[FastEmbedSparse]): The initialized FastEmbedSparse model.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]]
    sparse_embedding_model: Optional[FastEmbedSparse] = None

    def __init__(
        self,
        model_name: str = "prithvida/Splade_PP_en_v1",
        model_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize the SparseEmbeddings class with the specified model and configurations.

        Args:
            model_name (str): The name of the sparse embedding model.
                Defaults to "prithvida/Splade_PP_en_v1".
            model_kwargs (Optional[dict[str, Any]]): Additional model configuration
                parameters for initialization. Defaults to None (treated as {}).
        """
        if model_kwargs is None:
            model_kwargs = {}
        # weave.Model is a pydantic model: fields must be declared on the class
        # and initialized through super().__init__ before any attribute
        # assignment is allowed. The previous revision skipped both steps
        # (unlike DenseEmbeddings in this package) and failed at construction.
        super().__init__(model_name=model_name, model_kwargs=model_kwargs)

        # Initialize the sparse embedding model with the specified parameters.
        self.sparse_embedding_model = FastEmbedSparse(model_name=self.model_name, **self.model_kwargs)

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[dict[str, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[str, float]]: A list of sparse embedding dictionaries, one per
                document text, mapping terms to their corresponding weights.
        """
        return self.sparse_embedding_model.embed_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> dict[str, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[str, float]: A sparse embedding dictionary for the query text,
                where keys are terms and values are term weights.
        """
        return self.sparse_embedding_model.embed_query(text)
|
src/rag_pipelines/embeddings/sparse_milvus.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from langchain_milvus.utils.sparse import BaseSparseEmbedding
|
| 5 |
+
from spladerunner import Expander
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SparseEmbeddingsMilvus(BaseSparseEmbedding):
    """Generate sparse embeddings for documents and queries using a SPLADE `Expander` model.

    Implements the `BaseSparseEmbedding` interface used by Milvus hybrid search,
    backed by the spladerunner `Expander` term-expansion model.

    Attributes:
        model_name (str): The name of the SPLADE expansion model to use.
        max_length (int): Maximum sequence length passed to the expander.
        sparse_embedding_model (Optional[Any]): The initialized spladerunner `Expander`.
    """

    model_name: str
    model_kwargs: Optional[dict[str, Any]] = None
    max_length: int = 512
    sparse_embedding_model: Optional[Any] = None

    def __init__(
        self,
        model_name: str = "Splade_PP_en_v1",
        max_length: int = 512,
    ):
        """Initialize the SparseEmbeddingsMilvus class with the specified model and configurations.

        Args:
            model_name (str): The name of the sparse embedding model. Defaults to "Splade_PP_en_v1".
            max_length (int): Maximum sequence length for the expander. Defaults to 512.
        """
        self.model_name = model_name
        self.max_length = max_length

        # Initialize the sparse embedding model with specified parameters
        self.sparse_embedding_model = Expander(model_name=self.model_name, max_length=self.max_length)

    def _sparse_to_dict(self, sparse_vector: Any) -> dict[int, float]:
        """Convert an ``{"indices": [...], "values": [...]}`` sparse vector into a dict."""
        return dict(zip(sparse_vector["indices"], sparse_vector["values"]))

    @weave.op()
    def embed_query(self, text: str) -> dict[int, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[int, float]: A sparse embedding dictionary for the query text,
                mapping vocabulary indices to term weights.
        """
        sparse_embeddings = list(self.sparse_embedding_model.expand([text]))
        return self._sparse_to_dict(sparse_embeddings[0])

    @weave.op()
    def embed_documents(self, texts: list[str]) -> list[dict[int, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[int, float]]: A list of sparse embedding dictionaries, one per
                document text, mapping vocabulary indices to term weights.
        """
        sparse_embeddings = list(self.sparse_embedding_model.expand(texts))
        return [self._sparse_to_dict(sparse_embeddings[i]) for i in range(len(texts))]
|
src/rag_pipelines/embeddings/sparse_pinecone_text.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from pinecone_text.sparse import SpladeEncoder
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SparseEmbeddingsSplade(weave.Model):
    """Generate sparse embeddings for documents and queries using the SpladeEncoder model.

    Attributes:
        model_kwargs (Optional[dict[str, Any]]): Additional configuration parameters for the model.
        sparse_embedding_model (Optional[SpladeEncoder]): The SpladeEncoder initialized
            with the specified parameters.
    """

    model_kwargs: Optional[dict[str, Any]]
    sparse_embedding_model: Optional[SpladeEncoder] = None

    def __init__(
        self,
        model_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize the SparseEmbeddingsSplade class with the specified configurations.

        Args:
            model_kwargs (Optional[dict[str, Any]]): Additional model configuration
                parameters for initialization. Defaults to None (treated as {}).
        """
        super().__init__(model_kwargs=model_kwargs)

        self.model_kwargs = model_kwargs if model_kwargs is not None else {}

        # Initialize the sparse embedding model with specified parameters
        self.sparse_embedding_model = SpladeEncoder(**self.model_kwargs)

    @weave.op()
    def embed_texts(self, texts: list[str]) -> list[dict[str, float]]:
        """Embed a list of texts and return their sparse embeddings.

        Args:
            texts (list[str]): A list of document texts to embed.

        Returns:
            list[dict[str, float]]: A list of sparse embedding dictionaries, one per
                document text, mapping terms to their corresponding weights.
        """
        return self.sparse_embedding_model.encode_documents(texts)

    @weave.op()
    def embed_query(self, text: str) -> dict[str, float]:
        """Embed a single query text and return its sparse embedding.

        Args:
            text (str): The query text to embed.

        Returns:
            dict[str, float]: A sparse embedding dictionary for the query text,
                where keys are terms and values are term weights.
        """
        # Pass the raw string: `encode_queries` returns a single dict for a str
        # input but a *list* of dicts for a list input. The previous revision
        # passed `[text]`, returning a one-element list and violating the
        # declared dict[str, float] return type.
        return self.sparse_embedding_model.encode_queries(text)
|
src/rag_pipelines/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.evaluation.evaluator import Evaluator
|
| 2 |
+
from rag_pipelines.evaluation.response.answer_relevancy import AnswerRelevancyScorer
|
| 3 |
+
from rag_pipelines.evaluation.response.faithfulness import FaithfulnessScorer
|
| 4 |
+
from rag_pipelines.evaluation.response.hallucination import HallucinationScorer
|
| 5 |
+
from rag_pipelines.evaluation.response.summarization import SummarizationScorer
|
| 6 |
+
from rag_pipelines.evaluation.retrieval.contextual_precision import ContextualPrecisionScorer
|
| 7 |
+
from rag_pipelines.evaluation.retrieval.contextual_recall import ContextualRecallScorer
|
| 8 |
+
from rag_pipelines.evaluation.retrieval.contextual_relevancy import ContextualRelevancyScorer
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"AnswerRelevancyScorer",
|
| 12 |
+
"ContextualPrecisionScorer",
|
| 13 |
+
"ContextualRecallScorer",
|
| 14 |
+
"ContextualRelevancyScorer",
|
| 15 |
+
"Evaluator",
|
| 16 |
+
"FaithfulnessScorer",
|
| 17 |
+
"HallucinationScorer",
|
| 18 |
+
"SummarizationScorer",
|
| 19 |
+
]
|
src/rag_pipelines/evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
|
| 3 |
+
from weave import Dataset, Evaluation, Model, Scorer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Evaluator:
    """Evaluate a model on a dataset using a list of scorers.

    Attributes:
        evaluation_name (str): The name of the evaluation run.
        evaluation_dataset (Dataset): The dataset used for evaluation.
        evaluation_scorers (list[Scorer]): Scorer objects used to evaluate the pipeline.
        pipeline (Model): The pipeline (model) to be evaluated.
    """

    def __init__(
        self,
        evaluation_name: str,
        evaluation_dataset: Dataset,
        evaluation_scorers: list[Scorer],
        pipeline: Model,
    ):
        """Initialize the Evaluator instance with the specified evaluation parameters.

        Args:
            evaluation_name (str): A unique identifier for the evaluation run.
            evaluation_dataset (Dataset): A `Dataset` object representing the data for evaluation.
            evaluation_scorers (list[Scorer]): A list of `Scorer` objects that calculate various metrics.
            pipeline (Model): The model or pipeline to evaluate.
        """
        self.evaluation_name = evaluation_name
        self.evaluation_dataset = evaluation_dataset
        self.evaluation_scorers = evaluation_scorers
        self.pipeline = pipeline

    def evaluate(self) -> dict:
        """Evaluate the pipeline using the configured dataset and scorers.

        Creates a weave `Evaluation`, runs it to completion on the pipeline,
        and returns the aggregated results.

        Returns:
            dict: The evaluation results produced by `Evaluation.evaluate`.
                (The previous revision was annotated ``-> None`` while
                returning this value.)

        Raises:
            RuntimeError: If the underlying evaluation run fails for any reason;
                the original exception is chained as the cause.
        """
        evaluation = Evaluation(
            evaluation_name=self.evaluation_name,
            dataset=self.evaluation_dataset,
            scorers=self.evaluation_scorers,
        )

        try:
            # Evaluation.evaluate is a coroutine; drive it to completion here.
            evaluation_results = asyncio.run(evaluation.evaluate(self.pipeline))
        except Exception as exception:
            msg = f"Evaluation run failed: {exception}"
            raise RuntimeError(msg) from exception

        return evaluation_results
|
src/rag_pipelines/evaluation/response/__init__.py
ADDED
|
File without changes
|
src/rag_pipelines/evaluation/response/answer_relevancy.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from statistics import variance
|
| 2 |
+
from typing import Optional, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import weave
|
| 6 |
+
from deepeval.metrics import AnswerRelevancyMetric
|
| 7 |
+
from deepeval.test_case import LLMTestCase
|
| 8 |
+
from weave import Scorer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). Declared as
    # Optional with a None default; actual values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the
                threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

            Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using
        the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (dict): The LLM generated response to evaluate and the retrieval context.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score}

        return result

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries containing the following keys:
                - "score" (float): The computed answer relevancy score.

        Returns:
            dict: A dictionary containing the following keys:
                - "answer_relevancy_score" (dict): A dictionary containing the following keys:
                    - "score" (float): The average answer relevancy score.
                    - "variance" (float): The variance of the answer relevancy scores.
                    - "std" (float): The standard deviation of the answer relevancy scores.
                    - "count" (int): The number of answer relevancy scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]
        count = len(scores)

        # Guard against an empty run: np.mean/var/std of an empty array yield NaN
        # (with a runtime warning), which is not a useful summary value.
        if count == 0:
            return {"answer_relevancy_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        score = np.mean(scores).item()
        variance = np.var(scores).item()
        std = np.std(scores).item()

        return {"answer_relevancy_score": {"score": score, "variance": variance, "std": std, "count": count}}
|
src/rag_pipelines/evaluation/response/faithfulness.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import FaithfulnessMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FaithfulnessScorer(Scorer):
    """Evaluate the faithfulness of LLM generated outputs.

    This scorer uses DeepEval's `Faithfulness` Metric.

    The faithfulness metric measures the quality of your LLM generation by evaluating whether the `actual_output`
    factually aligns with the contents of your `retrieval_context`.

    Attributes:
        threshold (float): The minimum score required to pass the faithfulness check, defaults to 0.5.
        model (str): The LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the assigned score, defaults to True.
        strict_mode (bool): When True, enforces binary scoring (1 for perfect alignment, 0 otherwise).
            Overrides the threshold to 1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to display intermediate steps during metric computation, defaults to False.
        truths_extraction_limit (Optional[int]): Limits the number of key facts to extract from the retrieval
            context for evaluation, ordered by importance. Defaults to None.
        metric (FaithfulnessMetric): An instance of DeepEval's `FaithfulnessMetric` for scoring.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). Declared as
    # Optional with a None default; actual values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    truths_extraction_limit: Optional[int] = None
    metric: Optional[FaithfulnessMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Faithfulness Scorer with DeepEval's Faithfulness Metric.

        Args:
            threshold (float): The minimum score required to pass the faithfulness check, defaults to 0.5.
            model (str): The LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the assigned score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect alignment, 0 otherwise).
                Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to display intermediate steps during metric computation, defaults to False.
            truths_extraction_limit (Optional[int]): Limits the number of key facts to extract from the retrieval
                context for evaluation, ordered by importance. Defaults to None.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            truths_extraction_limit=truths_extraction_limit,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose
        self.truths_extraction_limit = truths_extraction_limit

        # Forward truths_extraction_limit to the metric: previously it was stored
        # but never used, silently ignoring the caller's setting.
        self.metric = FaithfulnessMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the faithfulness of an LLM generated response.

        Faithfulness is calculated as:

            Faithfulness = (Number of Truthful Claims) / (Total Number of Claims)

        The Faithfulness Metric evaluates all claims in the `actual_output` and checks
        whether they are truthful based on the facts in the `retrieval_context`. Claims
        are marked truthful if they align with or do not contradict any facts in the context.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed faithfulness score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score, "reason": self.metric.reason}

        return result
|
src/rag_pipelines/evaluation/response/hallucination.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import HallucinationMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class HallucinationScorer(Scorer):
    """Evaluate the factual alignment of the generated output with the provided context.

    This scorer uses DeepEval's `Hallucination` Metric to assess how well the generated output
    aligns with the reference context.

    The Hallucination metric determines whether your LLM generates factually correct information by comparing
    the `actual_output` to the provided `context`.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for perfection,
            0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to False.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric to the console,
            defaults to False.
        metric (HallucinationMetric): The DeepEval HallucinationMetric.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model). Declared as
    # Optional with a None default; actual values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[HallucinationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Hallucination scorer using DeepEval's Hallucination Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score: 1 for perfection,
                0 otherwise. It also overrides the current threshold and sets it to 1. Defaults to False.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said metric to the console,
                defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = HallucinationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the factual alignment of the generated output with the provided context.

        The Hallucination Score is calculated according to the following equation:

            Hallucination = Number of Contradicted Contexts / Total Number of Contexts

        The Hallucination Score uses an LLM to determine, for each context in `contexts`, whether there are any
        contradictions to the `actual_output`.

        Although extremely similar to the Faithfulness Scorer, the Hallucination Score is calculated differently
        since it uses `contexts` as the source of truth instead. Since `contexts` is the ideal segment of your
        knowledge base relevant to a specific input, the degree of hallucination can be measured by the degree
        of which the `contexts` is disagreed upon.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed hallucination score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score, "reason": self.metric.reason}

        return result
|
src/rag_pipelines/evaluation/response/phoenix_hallucination.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import AnswerRelevancyMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    NOTE(review): this class lives in `phoenix_hallucination.py` but wraps DeepEval's
    AnswerRelevancyMetric, not a Phoenix hallucination evaluator — confirm intent.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # Pydantic field declarations (weave.Scorer is a pydantic model); siblings in this
    # package declare fields and call super().__init__, which this class was missing.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the
                threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # AnswerRelevancyMetric takes `verbose_mode` (as used by every sibling
        # scorer in this package), not `verbose`.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

            Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using
        the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        result: dict[str, Union[str, float]] = {}

        self.metric.measure(test_case)
        result = {"score": self.metric.score, "reason": self.metric.reason}

        return result
|
src/rag_pipelines/evaluation/response/summarization.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import SummarizationMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SummarizationScorer(Scorer):
    """Summarization Scorer.

    This scorer uses DeepEval's `Summarization` metric to assess how well the generated output
    aligns with the reference context.

    The summarization metric uses LLMs to determine whether the LLM application is generating
    factually correct summaries while including the necessary details from the original text.

    Attributes:
        threshold (float): Minimum passing threshold, defaults to 0.5.
        model (str): LLM model for scoring, defaults to "gpt-4".
        include_reason (bool): Include reason for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
        async_mode (bool): Use asynchronous scoring, defaults to True.
        verbose (bool): Print intermediate steps used for scoring, defaults to False.
        assessment_questions (Optional[list[str]]): A list of closed-ended questions that can be
            answered with either a 'yes' or a 'no'. These are questions you want your summary to
            ideally be able to answer, and are especially helpful if you already know what a good
            summary for your use case looks like. If not provided, the metric generates its own
            questions at evaluation time.
        n (Optional[int]): Number of assessment questions to generate when `assessment_questions`
            is not provided. Defaults to 5.
        truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
            from the retrieval_context. Defaults to None.
        metric (Optional[SummarizationMetric]): An instance of DeepEval's `SummarizationMetric`
            used for scoring; built in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    assessment_questions: Optional[list[str]] = None
    n: Optional[int] = 5
    truths_extraction_limit: Optional[int] = None
    metric: Optional[SummarizationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        assessment_questions: Optional[list[str]] = None,
        n: Optional[int] = 5,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Summarization Scorer with DeepEval's Summarization Metric.

        Args:
            threshold (float): Minimum passing threshold, defaults to 0.5.
            model (str): LLM model for scoring, defaults to "gpt-4".
            include_reason (bool): Include reason for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
            async_mode (bool): Use asynchronous scoring, defaults to True.
            verbose (bool): Print intermediate steps used for scoring, defaults to False.
            assessment_questions (Optional[list[str]]): A list of closed-ended questions that can
                be answered with either a 'yes' or a 'no'. These are questions you want your
                summary to ideally be able to answer. If `assessment_questions` is not provided,
                the metric will generate a set of `assessment_questions` at evaluation time.
            n (Optional[int]): The number of assessment questions to generate when
                `assessment_questions` is not provided. Defaults to 5.
            truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
                from the retrieval_context. Defaults to None.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            assessment_questions=assessment_questions,
            n=n,
            truths_extraction_limit=truths_extraction_limit,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose
        self.assessment_questions = assessment_questions
        self.n = n
        self.truths_extraction_limit = truths_extraction_limit

        self.metric = SummarizationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            assessment_questions=self.assessment_questions,
            n=self.n,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the quality of summarization of an LLM generated response.

        The Summarization score is calculated according to the following equation:

            Summarization = min(Alignment Score, Coverage Score)

        where,
        - Alignment Score: determines whether the summary contains hallucinated or contradictory
          information relative to the original text.
        - Coverage Score: determines whether the summary contains the necessary information from
          the original text.

        While the Alignment Score is similar to that of the Hallucination Score, the Coverage
        Score is first calculated by generating n closed-ended questions that can only be
        answered with either a 'yes' or a 'no', before calculating the ratio for which the
        original text and summary yield the same answer.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information
                to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed summarization score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # measure() populates `score` and `reason` on the metric instance.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
|
src/rag_pipelines/evaluation/retrieval/__init__.py
ADDED
|
File without changes
|
src/rag_pipelines/evaluation/retrieval/contextual_precision.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import weave
|
| 5 |
+
from deepeval.metrics import ContextualPrecisionMetric
|
| 6 |
+
from deepeval.test_case import LLMTestCase
|
| 7 |
+
from weave import Scorer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ContextualPrecisionScorer(Scorer):
    """Evaluate the contextual precision of the generated output with the provided context.

    This scorer uses DeepEval's `Contextual Precision` metric to assess how well the generated
    output aligns with the reference context.

    The contextual precision metric measures the quality of the pipeline's retriever by
    evaluating whether results in the `retrieval_context` that are relevant to the given input
    are ranked higher than irrelevant ones.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults
            to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
            1 for perfection, 0 otherwise. It also overrides the current threshold and sets it
            to 1. Defaults to True here.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric
            to the console, defaults to False.
        metric (Optional[ContextualPrecisionMetric]): The DeepEval ContextualPrecisionMetric,
            built in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = True
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualPrecisionMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Precision Scorer using DeepEval's Contextual Precision Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score,
                defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said
                metric to the console, defaults to False.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = ContextualPrecisionMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual precision of the generated output with the provided context.

        The Contextual Precision Score is calculated according to the following equation:

            Contextual Precision = (1 / Number of Relevant Results)
                * Sum over k of (Number of Relevant Results up to position k / k)
                * (Binary Relevance of the k'th result)

        where,
        - k: The position of the result in the list of all results.

        The Contextual Precision Scorer first uses an LLM to determine for each result in the
        `retrieval_context` whether it is relevant to the input based on information in the
        `expected_output`, before calculating the weighted cumulative precision as the
        contextual precision score.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The pipeline output, expected to carry the keys "output"
                (the generated answer) and "retrieval_context" (the retrieved passages).
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults
                to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual precision score.
        """
        # Guard against the default: the original code called output.get(...) on None.
        output = output or {}

        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the Contextual Precision Scorer.

        Args:
            score_rows (list): A list of dictionaries, each containing:
                - "score" (float): A computed contextual precision score.

        Returns:
            dict: A dictionary with key "contextual_precision_score" mapping to:
                - "score" (float): The average contextual precision score.
                - "variance" (float): The variance of the contextual precision scores.
                - "std" (float): The standard deviation of the contextual precision scores.
                - "count" (int): The number of contextual precision scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # np.mean([]) yields nan with a RuntimeWarning; report zeros for an empty batch.
        if not scores:
            return {"contextual_precision_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        return {
            "contextual_precision_score": {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        }
|
src/rag_pipelines/evaluation/retrieval/contextual_recall.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import ContextualRecallMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ContextualRecallScorer(Scorer):
    """Evaluate the contextual recall of the generated output with the provided context.

    This scorer uses DeepEval's `ContextualRecall` metric to assess how well the generated
    output aligns with the reference context.

    The contextual recall metric measures the quality of the pipeline's retriever by evaluating
    the extent to which the `retrieval_context` aligns with the `expected_output`.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults
            to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
            1 for perfection, 0 otherwise. It also overrides the current threshold and sets it
            to 1. Defaults to True here.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric
            to the console, defaults to False.
        metric (Optional[ContextualRecallMetric]): The DeepEval ContextualRecallMetric, built
            in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = True
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualRecallMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Recall Scorer using DeepEval's Contextual Recall Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score,
                defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said
                metric to the console, defaults to False.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = ContextualRecallMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual recall of the generated output with the provided context.

        The Contextual Recall Score is calculated according to the following equation:

            Contextual Recall = Number of Attributable Results / Total Number of Results

        The Contextual Recall Scorer first uses an LLM to extract all statements made in the
        `expected_output`, before using the same LLM to classify whether each statement can be
        attributed to results in the `retrieval_context`.

        A higher contextual recall score represents a greater ability of the retrieval system
        to capture all relevant information from the total available relevant set within your
        knowledge base.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information
                to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults
                to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual recall score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # measure() populates `score` and `reason` on the metric instance.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
|
src/rag_pipelines/evaluation/retrieval/contextual_relevancy.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import weave
|
| 4 |
+
from deepeval.metrics import ContextualRelevancyMetric
|
| 5 |
+
from deepeval.test_case import LLMTestCase
|
| 6 |
+
from weave import Scorer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ContextualRelevancyScorer(Scorer):
    """Evaluate the contextual relevancy of the generated output with the provided context.

    This scorer uses DeepEval's `ContextualRelevancy` metric to assess how well the generated
    output aligns with the reference context.

    The contextual relevancy metric measures the quality of the RAG pipeline's retriever by
    evaluating the overall relevance of the information presented in the `retrieval_context`
    for a given input.

    Attributes:
        threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
        model (str): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (bool): Whether to include a reason for the evaluation score, defaults
            to True.
        strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
            1 for perfection, 0 otherwise. It also overrides the current threshold and sets it
            to 1. Defaults to True here.
        async_mode (bool): Whether to use asynchronous scoring, defaults to True.
        verbose (bool): Whether to print the intermediate steps used to calculate said metric
            to the console, defaults to False.
        metric (Optional[ContextualRelevancyMetric]): The DeepEval ContextualRelevancyMetric,
            built in `__init__`.
    """

    # NOTE: the previous declarations used `= Optional[None]`, which assigns the typing
    # special form itself (it collapses to NoneType) as the field default. The defaults
    # below mirror the `__init__` defaults instead.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = True
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[ContextualRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Relevancy Scorer using DeepEval's Contextual Relevancy Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score,
                defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate said
                metric to the console, defaults to False.
        """
        # Scorer is a pydantic model, so the declared fields must be passed through here.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        self.metric = ContextualRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual relevancy of the generated output with the provided context.

        The Contextual Relevancy Score is calculated according to the following equation:

            Contextual Relevancy = Number of Relevant Results / Total Number of Results

        Although similar to how the Answer Relevancy Score is calculated, the Contextual
        Relevancy Metric first uses an LLM to extract all statements made in the
        `retrieval_context` instead, before using the same LLM to classify whether each
        statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information
                to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults
                to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # measure() populates `score` and `reason` on the metric instance.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}
|
src/rag_pipelines/llms/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.llms.groq import ChatGroqGenerator
|
| 2 |
+
|
| 3 |
+
__all__ = ["ChatGroqGenerator"]
|
src/rag_pipelines/llms/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (256 Bytes). View file
|
|
|
src/rag_pipelines/llms/__pycache__/groq.cpython-310.pyc
ADDED
|
Binary file (3.63 kB). View file
|
|
|
src/rag_pipelines/llms/groq.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
import weave
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_groq import ChatGroq
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
|
| 9 |
+
from rag_pipelines.prompts import STRUCTURED_RAG_PROMPT, RAGResponseModel
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ChatGroqGenerator:
    """Generate responses with a ChatGroq model from user queries and graded documents.

    Wraps prompt construction, structured-output binding, and model invocation behind a
    single callable interface.
    """

    model: str
    api_key: str
    llm_params: dict[str, Any]
    llm: Optional[ChatGroq] = None
    structured_output_model: BaseModel
    system_prompt: str

    def __init__(
        self,
        model: str,
        api_key: Optional[str] = None,
        llm_params: Optional[dict[str, Any]] = None,
        structured_output_model: BaseModel = RAGResponseModel,
        system_prompt: str = STRUCTURED_RAG_PROMPT,
    ):
        """Initialize the ChatGroqGenerator with configuration parameters.

        Args:
            model (str): The name of the ChatGroq model to use.
            api_key (Optional[str]): API key for the ChatGroq service. If not provided,
                the "GROQ_API_KEY" environment variable will be used.
            llm_params (Optional[dict]): Additional parameters for configuring the ChatGroq model.
            structured_output_model (BaseModel): The output model for structured responses.
            system_prompt (str): The system prompt for the ChatGroq model.

        Raises:
            ValueError: If the API key is not provided and the "GROQ_API_KEY" environment
                variable is not set.
        """
        resolved_key = api_key or os.environ.get("GROQ_API_KEY")
        if resolved_key is None:
            msg = "GROQ_API_KEY is not set. Please provide an API key or set it as an environment variable."
            raise ValueError(msg)

        self.model = model
        self.api_key = resolved_key
        self.llm_params = {} if llm_params is None else llm_params
        self.structured_output_model = structured_output_model
        self.system_prompt = system_prompt

        self.llm = ChatGroq(model=self.model, api_key=self.api_key, **self.llm_params)

    @weave.op()
    def __call__(self, state: dict[str, Any]) -> dict[str, Any]:
        """Generate a response using the current state of user prompts and graded documents.

        Args:
            state (dict[str, Any]): The current state, containing:
                - 'question': The user question.
                - 'context': A list of filtered document texts.
                - 'documents': A list of retrieved documents.

        Returns:
            dict[str, Any]: A dictionary containing:
                - 'question': The user question.
                - 'context': A list of filtered document texts.
                - 'documents': A list of retrieved documents.
                - 'answer': The generated response.
        """
        question = state["question"]
        context = state["context"]
        documents = state["documents"]

        # Single system message; the prompt template interpolates question and context.
        chat_prompt = ChatPromptTemplate.from_messages([("system", self.system_prompt)])
        structured_llm = self.llm.with_structured_output(self.structured_output_model)
        chain = chat_prompt | structured_llm

        reply = chain.invoke({"question": question, "context": "\n".join(context)})

        return {
            "question": question,
            "context": context,
            "documents": documents,
            "answer": reply.final_answer,
        }
|
src/rag_pipelines/pipelines/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag_pipelines.pipelines.self_rag import SelfRAGPipeline
|
| 2 |
+
|
| 3 |
+
__all__ = ["SelfRAGPipeline"]
|
src/rag_pipelines/pipelines/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (268 Bytes). View file
|
|
|
src/rag_pipelines/pipelines/__pycache__/self_rag.cpython-310.pyc
ADDED
|
Binary file (5.1 kB). View file
|
|
|
src/rag_pipelines/pipelines/__pycache__/self_rag_graph_state.cpython-310.pyc
ADDED
|
Binary file (1.09 kB). View file
|
|
|
src/rag_pipelines/pipelines/adaptive_rag.py
ADDED
|
File without changes
|
src/rag_pipelines/pipelines/adaptive_rag_graph_state.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.documents import Document
|
| 2 |
+
from typing_extensions import TypedDict
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class AdaptiveRAGGraphState(TypedDict):
|
| 6 |
+
"""Represents the state of the graph for the Adaptive Retrieval-Augmentation-Generation (Adaptive-RAG) pipeline.
|
| 7 |
+
|
| 8 |
+
Attributes:
|
| 9 |
+
question (str): The input question for the pipeline.
|
| 10 |
+
answer (str): The generated response from the LLM.
|
| 11 |
+
documents (list[Document]): A list of LangChain documents that are retrieved and processed through the pipeline.
|
| 12 |
+
context (list[str]): The final list of context documents passed to the LLM for generating the answer.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
question: str
|
| 16 |
+
answer: str
|
| 17 |
+
documents: list[Document]
|
| 18 |
+
context: list[str]
|
src/rag_pipelines/pipelines/crag.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
import weave
|
| 5 |
+
from langchain_community.retrievers import PineconeHybridSearchRetriever
|
| 6 |
+
from langchain_core.prompts.chat import ChatPromptTemplate
|
| 7 |
+
from langgraph.graph import END, START, StateGraph
|
| 8 |
+
from langgraph.graph.state import CompiledStateGraph
|
| 9 |
+
from weave.integrations.langchain import WeaveTracer
|
| 10 |
+
|
| 11 |
+
from rag_pipelines.llms.groq import ChatGroqGenerator
|
| 12 |
+
from rag_pipelines.pipelines.crag_graph_state import CRAGGraphState
|
| 13 |
+
from rag_pipelines.query_transformer import QueryTransformer
|
| 14 |
+
from rag_pipelines.retrieval_evaluator import DocumentGrader, QueryDecisionMaker
|
| 15 |
+
from rag_pipelines.websearch import WebSearch
|
| 16 |
+
|
| 17 |
+
# Disable global tracing explicitly
|
| 18 |
+
os.environ["WEAVE_TRACE_LANGCHAIN"] = "false"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CorrectiveRAGPipeline(weave.Model):
|
| 22 |
+
"""A corrective retrieval-augmented generation (RAG) pipeline using Weave for tracing and LangChain components.
|
| 23 |
+
|
| 24 |
+
This pipeline integrates document retrieval, relevance evaluation, grading, query transformation, web search,
|
| 25 |
+
and LLM-based response generation to implement a corrective RAG system. It utilizes Weave for tracing execution
|
| 26 |
+
details and LangChain components for processing.
|
| 27 |
+
|
| 28 |
+
Attributes:
|
| 29 |
+
retriever (Optional[PineconeHybridSearchRetriever]): The retrieval model used to fetch relevant documents based on a query.
|
| 30 |
+
prompt (Optional[ChatPromptTemplate]): The prompt template to generate questions for the LLM.
|
| 31 |
+
generator (Optional[ChatGroqGenerator]): The language model used to generate responses.
|
| 32 |
+
grader (Optional[DocumentGrader]): Grades documents based on evaluation results.
|
| 33 |
+
query_transformer (Optional[QueryTransformer]): Transforms user queries to optimize retrieval.
|
| 34 |
+
web_search (Optional[WebSearch]): Performs web search for additional context.
|
| 35 |
+
tracing_project_name (str): The name of the Weave project for tracing.
|
| 36 |
+
weave_params (Dict[str, Any]): Parameters for initializing Weave.
|
| 37 |
+
tracer (Optional[WeaveTracer]): The tracer used to record execution details with Weave.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
retriever: Optional[PineconeHybridSearchRetriever] = None
|
| 41 |
+
prompt: Optional[ChatPromptTemplate] = None
|
| 42 |
+
generator: Optional[ChatGroqGenerator] = None
|
| 43 |
+
grader: Optional[DocumentGrader] = None
|
| 44 |
+
query_transformer: Optional[QueryTransformer] = None
|
| 45 |
+
web_search: Optional[WebSearch] = None
|
| 46 |
+
tracing_project_name: str
|
| 47 |
+
weave_params: dict[str, Any]
|
| 48 |
+
tracer: Optional[WeaveTracer] = None
|
| 49 |
+
|
| 50 |
+
def __init__(
|
| 51 |
+
self,
|
| 52 |
+
retriever: PineconeHybridSearchRetriever,
|
| 53 |
+
prompt: ChatPromptTemplate,
|
| 54 |
+
generator: ChatGroqGenerator,
|
| 55 |
+
grader: DocumentGrader,
|
| 56 |
+
query_transformer: QueryTransformer,
|
| 57 |
+
web_search: WebSearch,
|
| 58 |
+
tracing_project_name: str = "corrective_rag",
|
| 59 |
+
weave_params: Optional[dict[str, Any]] = None,
|
| 60 |
+
):
|
| 61 |
+
"""Initialize the CorrectiveRAGPipeline.
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
retriever (PineconeHybridSearchRetriever): The retrieval model used to fetch documents for the RAG pipeline.
|
| 65 |
+
prompt (ChatPromptTemplate): The prompt template used to create questions for the LLM.
|
| 66 |
+
generator (ChatGroqGenerator): The language model used for response generation.
|
| 67 |
+
grader (DocumentGrader): Component to grade the relevance of evaluated documents.
|
| 68 |
+
query_transformer (QueryTransformer): Component to transform the user query.
|
| 69 |
+
web_search (WebSearch): Component to perform web search for additional context.
|
| 70 |
+
tracing_project_name (str): The name of the Weave project for tracing. Defaults to "corrective_rag".
|
| 71 |
+
weave_params (Dict[str, Any]): Additional parameters for initializing Weave.
|
| 72 |
+
"""
|
| 73 |
+
if weave_params is None:
|
| 74 |
+
weave_params = {}
|
| 75 |
+
|
| 76 |
+
super().__init__(
|
| 77 |
+
retriever=retriever,
|
| 78 |
+
prompt=prompt,
|
| 79 |
+
generator=generator,
|
| 80 |
+
grader=grader,
|
| 81 |
+
query_transformer=query_transformer,
|
| 82 |
+
web_search=web_search,
|
| 83 |
+
tracing_project_name=tracing_project_name,
|
| 84 |
+
weave_params=weave_params,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
self.retriever = retriever
|
| 88 |
+
self.prompt = prompt
|
| 89 |
+
self.generator = generator
|
| 90 |
+
self.grader = grader
|
| 91 |
+
self.query_transformer = query_transformer
|
| 92 |
+
self.web_search = web_search
|
| 93 |
+
self.tracing_project_name = tracing_project_name
|
| 94 |
+
self.weave_params = weave_params
|
| 95 |
+
|
| 96 |
+
self._initialize_weave(**weave_params)
|
| 97 |
+
|
| 98 |
+
def _initialize_weave(self, **weave_params) -> None:
|
| 99 |
+
"""Initialize Weave with the specified tracing project name.
|
| 100 |
+
|
| 101 |
+
Sets up the Weave environment and creates a tracer for monitoring pipeline execution.
|
| 102 |
+
|
| 103 |
+
Args:
|
| 104 |
+
weave_params (Dict[str, Any]): Additional parameters for configuring Weave.
|
| 105 |
+
"""
|
| 106 |
+
weave.init(self.tracing_project_name, **weave_params)
|
| 107 |
+
self.tracer = WeaveTracer()
|
| 108 |
+
|
| 109 |
+
def _build_crag_graph(self) -> CompiledStateGraph:
|
| 110 |
+
"""Build and compile the corrective RAG workflow graph.
|
| 111 |
+
|
| 112 |
+
The graph defines the flow between components like retrieval, grading, query transformation,
|
| 113 |
+
web search, and generation.
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
CompiledStateGraph: The compiled state graph representing the corrective RAG pipeline workflow.
|
| 117 |
+
"""
|
| 118 |
+
crag_workflow = StateGraph(CRAGGraphState)
|
| 119 |
+
|
| 120 |
+
# Define the nodes
|
| 121 |
+
crag_workflow.add_node("retrieve", self.retriever)
|
| 122 |
+
crag_workflow.add_node("grade_documents", self.grader)
|
| 123 |
+
crag_workflow.add_node("generate", self.generator)
|
| 124 |
+
crag_workflow.add_node("transform_query", self.query_transformer)
|
| 125 |
+
crag_workflow.add_node("web_search_node", self.web_search)
|
| 126 |
+
|
| 127 |
+
# Define edges between nodes
|
| 128 |
+
crag_workflow.add_edge(START, "retrieve")
|
| 129 |
+
crag_workflow.add_edge("retrieve", "grade_documents")
|
| 130 |
+
crag_workflow.add_conditional_edges(
|
| 131 |
+
"grade_documents",
|
| 132 |
+
QueryDecisionMaker(),
|
| 133 |
+
{
|
| 134 |
+
"transform_query": "transform_query",
|
| 135 |
+
"generate": "generate",
|
| 136 |
+
},
|
| 137 |
+
)
|
| 138 |
+
crag_workflow.add_edge("transform_query", "web_search_node")
|
| 139 |
+
crag_workflow.add_edge("web_search_node", "generate")
|
| 140 |
+
crag_workflow.add_edge("generate", END)
|
| 141 |
+
|
| 142 |
+
# Compile the graph
|
| 143 |
+
crag_pipeline = crag_workflow.compile()
|
| 144 |
+
|
| 145 |
+
return crag_pipeline
|
| 146 |
+
|
| 147 |
+
@weave.op()
|
| 148 |
+
def predict(self, question: str) -> str:
|
| 149 |
+
"""Execute the corrective RAG pipeline with the given question.
|
| 150 |
+
|
| 151 |
+
The pipeline retrieves documents, evaluates and grades their relevance, and generates a final response
|
| 152 |
+
using the LLM.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
question (str): The input question to be answered.
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
str: The final answer generated by the LLM.
|
| 159 |
+
|
| 160 |
+
Example:
|
| 161 |
+
```python
|
| 162 |
+
pipeline = CorrectiveRAGPipeline(...)
|
| 163 |
+
answer = pipeline.predict("What are the latest AI trends?")
|
| 164 |
+
print(answer)
|
| 165 |
+
```
|
| 166 |
+
"""
|
| 167 |
+
config = {"callbacks": [self.tracer]}
|
| 168 |
+
|
| 169 |
+
crag_graph = self._build_crag_graph()
|
| 170 |
+
response = crag_graph.invoke(question, config=config)
|
| 171 |
+
|
| 172 |
+
return response
|
src/rag_pipelines/pipelines/crag_graph_state.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing_extensions import TypedDict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class CRAGGraphState(TypedDict):
|
| 5 |
+
"""Represents the state of the graph for the Corrective Retrieval-Augmentation-Generation (CRAG) pipeline.
|
| 6 |
+
|
| 7 |
+
Attributes:
|
| 8 |
+
question (str): The input question for the pipeline.
|
| 9 |
+
generation (str): The generated response from the LLM.
|
| 10 |
+
web_search (str): Indicates whether a web search is required (e.g., "yes" or "no").
|
| 11 |
+
documents (List[str]): A list of relevant documents retrieved or processed.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
question: str
|
| 15 |
+
generation: str
|
| 16 |
+
web_search: str
|
| 17 |
+
documents: list[str]
|
src/rag_pipelines/pipelines/dspy/dspy_baseline_rag.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from dspy_modules.evaluator import DSPyEvaluator
|
| 6 |
+
from dspy_modules.rag import DSPyRAG
|
| 7 |
+
from dspy_modules.weaviate_db import WeaviateVectorDB
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main(cluster_url, api_key, index_name, model_name, llm_model, llm_api_key):
|
| 11 |
+
# Load dataset
|
| 12 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 13 |
+
questions = earnings_calls_data["question"]
|
| 14 |
+
|
| 15 |
+
# Split into datasets
|
| 16 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 17 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 18 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 19 |
+
|
| 20 |
+
# Initialize Weaviate VectorDB
|
| 21 |
+
weaviate_db = WeaviateVectorDB(cluster_url, api_key, index_name, model_name)
|
| 22 |
+
|
| 23 |
+
# Initialize LLM
|
| 24 |
+
llm = dspy.LM(llm_model, api_key=llm_api_key, num_retries=120)
|
| 25 |
+
dspy.configure(lm=llm)
|
| 26 |
+
|
| 27 |
+
# Initialize RAG
|
| 28 |
+
rag = DSPyRAG(weaviate_db)
|
| 29 |
+
|
| 30 |
+
# Evaluate before compilation
|
| 31 |
+
evaluator = DSPyEvaluator()
|
| 32 |
+
evaluate = dspy.Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 33 |
+
evaluate(rag, metric=evaluator.llm_metric)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
if __name__ == "__main__":
|
| 37 |
+
parser = argparse.ArgumentParser(description="Run DSPy-based RAG pipeline")
|
| 38 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL")
|
| 39 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key")
|
| 40 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name")
|
| 41 |
+
parser.add_argument("--model_name", type=str, required=True, help="Embedding model name")
|
| 42 |
+
parser.add_argument("--llm_model", type=str, required=True, help="LLM model name")
|
| 43 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="LLM API key")
|
| 44 |
+
|
| 45 |
+
args = parser.parse_args()
|
| 46 |
+
main(args.cluster_url, args.api_key, args.index_name, args.model_name, args.llm_model, args.llm_api_key)
|
src/rag_pipelines/pipelines/dspy/dspy_bayesian_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.primitives.prediction import Prediction
|
| 8 |
+
from dspy.teleprompt import BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
|
| 9 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 10 |
+
from langchain_weaviate.vectorstores import WeaviateVectorStore
|
| 11 |
+
from weaviate.classes.init import Auth
|
| 12 |
+
|
| 13 |
+
# Argument Parser
|
| 14 |
+
parser = argparse.ArgumentParser(description="RAG Optimization with DSPy")
|
| 15 |
+
parser.add_argument(
|
| 16 |
+
"--optimizer",
|
| 17 |
+
type=str,
|
| 18 |
+
choices=["bootstrap", "bayesian"],
|
| 19 |
+
default="bootstrap",
|
| 20 |
+
help="Choose the optimization method",
|
| 21 |
+
)
|
| 22 |
+
args = parser.parse_args()
|
| 23 |
+
|
| 24 |
+
# Load dataset
|
| 25 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 26 |
+
questions = earnings_calls_data["question"]
|
| 27 |
+
|
| 28 |
+
# Create DSPy datasets
|
| 29 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 30 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 31 |
+
testset = [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 32 |
+
|
| 33 |
+
# Embeddings and Weaviate client
|
| 34 |
+
embeddings = HuggingFaceEmbeddings(
|
| 35 |
+
model_name="jinaai/jina-embeddings-v3",
|
| 36 |
+
model_kwargs={"device": "cpu", "trust_remote_code": True},
|
| 37 |
+
encode_kwargs={"task": "retrieval.query", "prompt_name": "retrieval.query"},
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 41 |
+
cluster_url="https://adrrwus9shkxkuijvazcrq.c0.us-west3.gcp.weaviate.cloud",
|
| 42 |
+
auth_credentials=Auth.api_key("J94gHySMWTWxggDDayGrF2ESGo23yOHZ1bUC"),
|
| 43 |
+
)
|
| 44 |
+
weaviate_db = WeaviateVectorStore(
|
| 45 |
+
index_name="LangChain_d73ad6159d514fec887456fa6db11e61",
|
| 46 |
+
embedding=embeddings,
|
| 47 |
+
client=weaviate_client,
|
| 48 |
+
text_key="text",
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# Configure LLM
|
| 52 |
+
llm = dspy.LM(
|
| 53 |
+
"groq/llama-3.3-70b-versatile",
|
| 54 |
+
api_key="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",
|
| 55 |
+
num_retries=120,
|
| 56 |
+
)
|
| 57 |
+
dspy.configure(lm=llm)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Define DSPy Module
|
| 61 |
+
class GenerateAnswer(dspy.Signature):
|
| 62 |
+
context = dspy.InputField(desc="may contain relevant facts")
|
| 63 |
+
question = dspy.InputField()
|
| 64 |
+
answer = dspy.OutputField(desc="short and precise answer")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class RAG(dspy.Module):
|
| 68 |
+
def __init__(self):
|
| 69 |
+
super().__init__()
|
| 70 |
+
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
|
| 71 |
+
|
| 72 |
+
def retrieve(self, question):
|
| 73 |
+
results = weaviate_db.similarity_search(query=question)
|
| 74 |
+
passages = [res.page_content for res in results]
|
| 75 |
+
return Prediction(passages=passages)
|
| 76 |
+
|
| 77 |
+
def forward(self, question):
|
| 78 |
+
context = self.retrieve(question).passages
|
| 79 |
+
prediction = self.generate_answer(context=context, question=question)
|
| 80 |
+
return dspy.Prediction(context=context, answer=prediction.answer)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Define LLM Metric
|
| 84 |
+
def llm_metric(gold, pred, trace=None):
|
| 85 |
+
predicted_answer = pred.answer
|
| 86 |
+
context = pred.context
|
| 87 |
+
detail = dspy.ChainOfThought(GenerateAnswer)(
|
| 88 |
+
context="N/A", assessed_question="Is the answer detailed?", assessed_answer=predicted_answer
|
| 89 |
+
)
|
| 90 |
+
faithful = dspy.ChainOfThought(GenerateAnswer)(
|
| 91 |
+
context=context, assessed_question="Is it grounded in context?", assessed_answer=predicted_answer
|
| 92 |
+
)
|
| 93 |
+
overall = dspy.ChainOfThought(GenerateAnswer)(
|
| 94 |
+
context=context, assessed_question=f"Rate the answer: {predicted_answer}", assessed_answer=predicted_answer
|
| 95 |
+
)
|
| 96 |
+
total = float(detail.answer) + float(faithful.answer) * 2 + float(overall.answer)
|
| 97 |
+
return total / 5.0
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# Evaluate before optimization
|
| 101 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 102 |
+
evaluate(RAG(), metric=llm_metric)
|
| 103 |
+
|
| 104 |
+
# Select Optimizer
|
| 105 |
+
if args.optimizer == "bootstrap":
|
| 106 |
+
optimizer = BootstrapFewShotWithRandomSearch(
|
| 107 |
+
metric=llm_metric,
|
| 108 |
+
max_bootstrapped_demos=4,
|
| 109 |
+
max_labeled_demos=4,
|
| 110 |
+
max_rounds=1,
|
| 111 |
+
num_candidate_programs=2,
|
| 112 |
+
num_threads=2,
|
| 113 |
+
)
|
| 114 |
+
else:
|
| 115 |
+
optimizer = BayesianSignatureOptimizer(
|
| 116 |
+
task_model=dspy.settings.lm, metric=llm_metric, prompt_model=dspy.settings.lm, n=5, verbose=False
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Compile optimized RAG
|
| 120 |
+
optimized_compiled_rag = optimizer.compile(RAG(), trainset=trainset)
|
| 121 |
+
|
| 122 |
+
# Evaluate optimized RAG
|
| 123 |
+
evaluate = Evaluate(metric=llm_metric, devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 124 |
+
evaluate(optimized_compiled_rag)
|
src/rag_pipelines/pipelines/dspy/dspy_bootstrap_few_shot_optimization_rag.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 6 |
+
from dspy.teleprompt import BootstrapFewShot
|
| 7 |
+
from dspy_modules.evaluator import llm_metric
|
| 8 |
+
from dspy_modules.rag import RAG
|
| 9 |
+
from dspy_modules.weaviate_db import WeaviateVectorDB
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def main(args):
|
| 13 |
+
# Load dataset
|
| 14 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 15 |
+
questions = earnings_calls_data["question"]
|
| 16 |
+
|
| 17 |
+
# Split dataset
|
| 18 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 19 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 20 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 21 |
+
|
| 22 |
+
# Initialize Weaviate VectorDB
|
| 23 |
+
weaviate_db = WeaviateVectorDB(
|
| 24 |
+
cluster_url=args.cluster_url, api_key=args.api_key, index_name=args.index_name, model_name=args.embedding_model
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# Initialize LLM
|
| 28 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
|
| 29 |
+
dspy.configure(lm=llm)
|
| 30 |
+
|
| 31 |
+
# Initialize and evaluate unoptimized RAG
|
| 32 |
+
uncompiled_rag = RAG(weaviate_db)
|
| 33 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 34 |
+
evaluate(uncompiled_rag, metric=llm_metric)
|
| 35 |
+
|
| 36 |
+
# Optimize RAG using BootstrapFewShot
|
| 37 |
+
optimizer = BootstrapFewShot(metric=llm_metric)
|
| 38 |
+
optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
|
| 39 |
+
|
| 40 |
+
# Evaluate optimized RAG
|
| 41 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 42 |
+
evaluate(optimized_compiled_rag)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
|
| 46 |
+
parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
|
| 47 |
+
|
| 48 |
+
# Weaviate parameters
|
| 49 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL")
|
| 50 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key")
|
| 51 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name")
|
| 52 |
+
parser.add_argument("--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model name")
|
| 53 |
+
|
| 54 |
+
# LLM parameters
|
| 55 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name")
|
| 56 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="LLM API key")
|
| 57 |
+
parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM calls")
|
| 58 |
+
|
| 59 |
+
args = parser.parse_args()
|
| 60 |
+
main(args)
|
src/rag_pipelines/pipelines/dspy/dspy_copro_instruction_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# https://github.com/weaviate/recipes/blob/main/integrations/llm-frameworks/dspy/1.Getting-Started-with-RAG-in-DSPy.ipynb
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.primitives.prediction import Prediction
|
| 8 |
+
from dspy.teleprompt import COPRO
|
| 9 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 10 |
+
from langchain_weaviate.vectorstores import WeaviateVectorStore
|
| 11 |
+
from weaviate.classes.init import Auth
|
| 12 |
+
|
| 13 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 14 |
+
questions = earnings_calls_data["question"]
|
| 15 |
+
|
| 16 |
+
# Create the dspy datasets
|
| 17 |
+
trainset = questions[:20] # 20 examples for training
|
| 18 |
+
devset = questions[20:30] # 10 examples for development
|
| 19 |
+
testset = questions[30:] # 20 examples for testing
|
| 20 |
+
|
| 21 |
+
trainset = [dspy.Example(question=question).with_inputs("question") for question in trainset]
|
| 22 |
+
devset = [dspy.Example(question=question).with_inputs("question") for question in devset]
|
| 23 |
+
testset = [dspy.Example(question=question).with_inputs("question") for question in testset]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
model_name = "jinaai/jina-embeddings-v3"
|
| 27 |
+
task = "retrieval.query"
|
| 28 |
+
model_kwargs = {"device": "cpu", "trust_remote_code": True}
|
| 29 |
+
encode_kwargs = {"task": task, "prompt_name": task}
|
| 30 |
+
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 34 |
+
cluster_url="https://adrrwus9shkxkuijvazcrq.c0.us-west3.gcp.weaviate.cloud",
|
| 35 |
+
auth_credentials=Auth.api_key("J94gHySMWTWxggDDayGrF2ESGo23yOHZ1bUC"),
|
| 36 |
+
)
|
| 37 |
+
weaviate_db = WeaviateVectorStore(
|
| 38 |
+
index_name="LangChain_d73ad6159d514fec887456fa6db11e61",
|
| 39 |
+
embedding=embeddings,
|
| 40 |
+
client=weaviate_client,
|
| 41 |
+
text_key="text",
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
llm = dspy.LM(
|
| 46 |
+
"groq/llama-3.3-70b-versatile",
|
| 47 |
+
api_key="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",
|
| 48 |
+
num_retries=120,
|
| 49 |
+
)
|
| 50 |
+
dspy.configure(lm=llm)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class GenerateAnswer(dspy.Signature):
|
| 54 |
+
"""Answer questions with short factoid answers."""
|
| 55 |
+
|
| 56 |
+
context = dspy.InputField(desc="may contain relevant facts")
|
| 57 |
+
question = dspy.InputField()
|
| 58 |
+
answer = dspy.OutputField(desc="short and precise answer")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class RAG(dspy.Module):
|
| 62 |
+
def __init__(self):
|
| 63 |
+
super().__init__()
|
| 64 |
+
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
|
| 65 |
+
|
| 66 |
+
# This makes it possible to use the Langchain VectorDB integration and custom embeddings with SentenceTransformers
|
| 67 |
+
def retrieve(self, question):
|
| 68 |
+
results = weaviate_db.similarity_search(query=question)
|
| 69 |
+
passages = [res.page_content for res in results]
|
| 70 |
+
return Prediction(passages=passages)
|
| 71 |
+
|
| 72 |
+
def forward(self, question):
|
| 73 |
+
context = self.retrieve(question).passages
|
| 74 |
+
prediction = self.generate_answer(context=context, question=question)
|
| 75 |
+
return dspy.Prediction(context=context, answer=prediction.answer)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Create an LLM as a Judge Evaluation Metric for evaluation of the RAG Pipelines
|
| 79 |
+
# (Taken from weaviate recipe)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class Assess(dspy.Signature):
|
| 83 |
+
"""Assess the quality of an answer to a question."""
|
| 84 |
+
|
| 85 |
+
context = dspy.InputField(desc="The context for answering the question.")
|
| 86 |
+
assessed_question = dspy.InputField(desc="The evaluation criterion.")
|
| 87 |
+
assessed_answer = dspy.InputField(desc="The answer to the question.")
|
| 88 |
+
assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def llm_metric(gold, pred, trace=None):
|
| 92 |
+
predicted_answer = pred.answer
|
| 93 |
+
context = pred.context
|
| 94 |
+
question = gold.question
|
| 95 |
+
|
| 96 |
+
print(f"Test Question: {question}")
|
| 97 |
+
print(f"Predicted Answer: {predicted_answer}")
|
| 98 |
+
|
| 99 |
+
detail = "Is the assessed answer detailed?"
|
| 100 |
+
faithful = (
|
| 101 |
+
"Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."
|
| 102 |
+
)
|
| 103 |
+
overall = f"Please rate how well this answer answers the question, `{question}` based on the context.\n `{predicted_answer}`"
|
| 104 |
+
|
| 105 |
+
detail = dspy.ChainOfThought(Assess)(context="N/A", assessed_question=detail, assessed_answer=predicted_answer)
|
| 106 |
+
faithful = dspy.ChainOfThought(Assess)(
|
| 107 |
+
context=context, assessed_question=faithful, assessed_answer=predicted_answer
|
| 108 |
+
)
|
| 109 |
+
overall = dspy.ChainOfThought(Assess)(context=context, assessed_question=overall, assessed_answer=predicted_answer)
|
| 110 |
+
|
| 111 |
+
print(f"Faithful: {faithful.assessment_answer}")
|
| 112 |
+
print(f"Detail: {detail.assessment_answer}")
|
| 113 |
+
print(f"Overall: {overall.assessment_answer}")
|
| 114 |
+
|
| 115 |
+
total = float(detail.assessment_answer) + float(faithful.assessment_answer) * 2 + float(overall.assessment_answer)
|
| 116 |
+
|
| 117 |
+
return total / 5.0
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Evaluate our RAG Program before it is compiled
|
| 121 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 122 |
+
evaluate(RAG(), metric=llm_metric)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Optimize the RAG Program
|
| 126 |
+
optimizer = COPRO(
|
| 127 |
+
prompt_model=dspy.settings.lm,
|
| 128 |
+
metric=llm_metric,
|
| 129 |
+
breadth=3,
|
| 130 |
+
depth=2,
|
| 131 |
+
init_temperature=0.25,
|
| 132 |
+
verbose=False,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
optimized_compiled_rag = optimizer.compile(
|
| 137 |
+
RAG(),
|
| 138 |
+
trainset=trainset,
|
| 139 |
+
eval_kwargs={"num_threads": 1, "display_progress": True, "display_table": 0},
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Evaluate the optimized RAG Program
|
| 143 |
+
evaluate = Evaluate(
|
| 144 |
+
metric=llm_metric,
|
| 145 |
+
devset=devset,
|
| 146 |
+
num_threads=1,
|
| 147 |
+
display_progress=True,
|
| 148 |
+
display_table=5,
|
| 149 |
+
)
|
| 150 |
+
evaluate(optimized_compiled_rag)
|
src/rag_pipelines/pipelines/dspy_baseline_rag.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
|
| 6 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 7 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 8 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main(cluster_url, api_key, index_name, model_name, llm_model, llm_api_key):
|
| 12 |
+
"""Executes the DSPy-based Retrieval-Augmented Generation (RAG) pipeline.
|
| 13 |
+
|
| 14 |
+
This function:
|
| 15 |
+
1. Loads a dataset of earnings call Q&A pairs.
|
| 16 |
+
2. Prepares development (dev) and test datasets for evaluation.
|
| 17 |
+
3. Initializes a Weaviate vector database for storing and retrieving embeddings.
|
| 18 |
+
4. Configures a Large Language Model (LLM) with DSPy.
|
| 19 |
+
5. Instantiates and evaluates the RAG pipeline before optimization.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
cluster_url (str): URL of the Weaviate vector database cluster.
|
| 23 |
+
api_key (str): API key for authenticating access to Weaviate.
|
| 24 |
+
index_name (str): Name of the Weaviate index for storing vectors.
|
| 25 |
+
model_name (str): Embedding model name for vectorization.
|
| 26 |
+
llm_model (str): Name of the LLM used for inference.
|
| 27 |
+
llm_api_key (str): API key for accessing the LLM.
|
| 28 |
+
"""
|
| 29 |
+
# Load the earnings calls Q&A dataset (first 50 samples)
|
| 30 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 31 |
+
questions = earnings_calls_data["question"]
|
| 32 |
+
|
| 33 |
+
# Prepare dataset splits:
|
| 34 |
+
# - The first 20 questions are used for training (not explicitly utilized here).
|
| 35 |
+
# - The next 10 questions are used as the development set (devset) for evaluation.
|
| 36 |
+
# - The remaining questions are part of the test set (not used in this script).
|
| 37 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 38 |
+
|
| 39 |
+
# Initialize Weaviate VectorDB for embedding storage and retrieval
|
| 40 |
+
weaviate_db = WeaviateVectorDB(
|
| 41 |
+
cluster_url=cluster_url, # Weaviate cluster URL
|
| 42 |
+
api_key=api_key, # API key for authentication
|
| 43 |
+
index_name=index_name, # Name of the index for vector storage
|
| 44 |
+
model_name=model_name, # Embedding model used for vectorization
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Initialize the LLM with DSPy
|
| 48 |
+
llm = dspy.LM(llm_model, api_key=llm_api_key, num_retries=120)
|
| 49 |
+
dspy.configure(lm=llm) # Set DSPy’s global LLM configuration
|
| 50 |
+
|
| 51 |
+
# Instantiate the RAG pipeline
|
| 52 |
+
rag = DSPyRAG(weaviate_db)
|
| 53 |
+
|
| 54 |
+
# Initialize evaluator for measuring LLM-based retrieval performance
|
| 55 |
+
evaluator = DSPyEvaluator()
|
| 56 |
+
|
| 57 |
+
# Evaluate the RAG pipeline before optimization
|
| 58 |
+
evaluate = dspy.Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 59 |
+
evaluate(rag, metric=evaluator.llm_metric)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
|
| 63 |
+
"""
|
| 64 |
+
Parses command-line arguments and runs the DSPy-based RAG pipeline.
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
parser = argparse.ArgumentParser(description="Run DSPy-based RAG pipeline")
|
| 68 |
+
|
| 69 |
+
# Weaviate configuration parameters
|
| 70 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
|
| 71 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
|
| 72 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
|
| 73 |
+
parser.add_argument("--model_name", type=str, required=True, help="Embedding model name for vectorization.")
|
| 74 |
+
|
| 75 |
+
# LLM configuration parameters
|
| 76 |
+
parser.add_argument("--llm_model", type=str, required=True, help="LLM model name.")
|
| 77 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM access.")
|
| 78 |
+
|
| 79 |
+
# Parse command-line arguments and execute the pipeline
|
| 80 |
+
args = parser.parse_args()
|
| 81 |
+
main(args.cluster_url, args.api_key, args.index_name, args.model_name, args.llm_model, args.llm_api_key)
|
src/rag_pipelines/pipelines/dspy_bayesian_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.teleprompt import BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from weaviate.classes.init import Auth
|
| 10 |
+
|
| 11 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 12 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 13 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorStore
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def parse_args():
|
| 17 |
+
"""Parse command-line arguments."""
|
| 18 |
+
parser = argparse.ArgumentParser(description="Optimize and evaluate RAG pipeline with DSPy.")
|
| 19 |
+
|
| 20 |
+
# Dataset Arguments
|
| 21 |
+
parser.add_argument(
|
| 22 |
+
"--dataset_name", type=str, default="lamini/earnings-calls-qa", help="Name of the dataset to use."
|
| 23 |
+
)
|
| 24 |
+
parser.add_argument("--dataset_size", type=int, default=50, help="Number of examples to load from the dataset.")
|
| 25 |
+
|
| 26 |
+
# Weaviate Configuration
|
| 27 |
+
parser.add_argument("--weaviate_url", type=str, required=True, help="Weaviate cloud cluster URL.")
|
| 28 |
+
parser.add_argument("--weaviate_api_key", type=str, required=True, help="API key for Weaviate.")
|
| 29 |
+
parser.add_argument("--index_name", type=str, required=True, help="Index name in Weaviate.")
|
| 30 |
+
parser.add_argument(
|
| 31 |
+
"--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model for Weaviate."
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# LLM Configuration
|
| 35 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model to use.")
|
| 36 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM.")
|
| 37 |
+
|
| 38 |
+
# Optimization Method
|
| 39 |
+
parser.add_argument(
|
| 40 |
+
"--optimizer",
|
| 41 |
+
type=str,
|
| 42 |
+
choices=["bootstrap", "bayesian"],
|
| 43 |
+
default="bootstrap",
|
| 44 |
+
help="Choose the optimization method.",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
return parser.parse_args()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main():
|
| 51 |
+
args = parse_args()
|
| 52 |
+
|
| 53 |
+
# Load dataset
|
| 54 |
+
dataset = load_dataset(args.dataset_name, split=f"train[:{args.dataset_size}]")
|
| 55 |
+
questions = dataset["question"]
|
| 56 |
+
|
| 57 |
+
# Create DSPy datasets
|
| 58 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 59 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 60 |
+
testset = [dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 61 |
+
|
| 62 |
+
# Initialize embeddings
|
| 63 |
+
model_kwargs = {"device": "cpu", "trust_remote_code": True}
|
| 64 |
+
encode_kwargs = {"task": "retrieval.query", "prompt_name": "retrieval.query"}
|
| 65 |
+
embeddings = HuggingFaceEmbeddings(
|
| 66 |
+
model_name=args.embedding_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Connect to Weaviate
|
| 70 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 71 |
+
cluster_url=args.weaviate_url,
|
| 72 |
+
auth_credentials=Auth.api_key(args.weaviate_api_key),
|
| 73 |
+
)
|
| 74 |
+
WeaviateVectorStore(
|
| 75 |
+
index_name=args.index_name,
|
| 76 |
+
embedding=embeddings,
|
| 77 |
+
client=weaviate_client,
|
| 78 |
+
text_key="text",
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
# Configure LLM
|
| 82 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=120)
|
| 83 |
+
dspy.configure(lm=llm)
|
| 84 |
+
|
| 85 |
+
# Evaluate before optimization
|
| 86 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 87 |
+
evaluate(DSPyRAG(), metric=DSPyEvaluator.llm_metric())
|
| 88 |
+
|
| 89 |
+
# Select Optimizer
|
| 90 |
+
if args.optimizer == "bootstrap":
|
| 91 |
+
optimizer = BootstrapFewShotWithRandomSearch(
|
| 92 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 93 |
+
max_bootstrapped_demos=4,
|
| 94 |
+
max_labeled_demos=4,
|
| 95 |
+
max_rounds=1,
|
| 96 |
+
num_candidate_programs=2,
|
| 97 |
+
num_threads=2,
|
| 98 |
+
)
|
| 99 |
+
else:
|
| 100 |
+
optimizer = BayesianSignatureOptimizer(
|
| 101 |
+
task_model=dspy.settings.lm,
|
| 102 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 103 |
+
prompt_model=dspy.settings.lm,
|
| 104 |
+
n=5,
|
| 105 |
+
verbose=False,
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
# Compile optimized RAG
|
| 109 |
+
optimized_compiled_rag = optimizer.compile(DSPyRAG(), testset=testset, trainset=trainset)
|
| 110 |
+
|
| 111 |
+
# Evaluate optimized RAG
|
| 112 |
+
evaluate = Evaluate(
|
| 113 |
+
metric=DSPyEvaluator.llm_metric(), devset=devset, num_threads=1, display_progress=True, display_table=5
|
| 114 |
+
)
|
| 115 |
+
evaluate(optimized_compiled_rag)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
main()
|
src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_optimization_rag.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 6 |
+
from dspy.teleprompt import BootstrapFewShot
|
| 7 |
+
|
| 8 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 9 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 10 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main(args):
|
| 14 |
+
"""Runs the DSPy RAG optimization pipeline.
|
| 15 |
+
|
| 16 |
+
This function:
|
| 17 |
+
1. Loads the earnings calls dataset.
|
| 18 |
+
2. Splits the dataset into training, development, and test sets.
|
| 19 |
+
3. Initializes a Weaviate vector database and an LLM.
|
| 20 |
+
4. Evaluates an unoptimized RAG pipeline.
|
| 21 |
+
5. Optimizes the RAG pipeline using BootstrapFewShot.
|
| 22 |
+
6. Evaluates the optimized RAG pipeline.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
args (argparse.Namespace): Command-line arguments for configuring the pipeline.
|
| 26 |
+
"""
|
| 27 |
+
# Load the dataset (Earnings Calls QA dataset)
|
| 28 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 29 |
+
questions = earnings_calls_data["question"]
|
| 30 |
+
|
| 31 |
+
# Split the dataset into training (20), development (10), and test sets
|
| 32 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 33 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 34 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]] # Test set (not used in this script)
|
| 35 |
+
|
| 36 |
+
# Initialize Weaviate VectorDB for storing and retrieving embeddings
|
| 37 |
+
weaviate_db = WeaviateVectorDB(
|
| 38 |
+
cluster_url=args.cluster_url, # URL of the Weaviate cluster
|
| 39 |
+
api_key=args.api_key, # API key for authentication
|
| 40 |
+
index_name=args.index_name, # Name of the Weaviate index
|
| 41 |
+
model_name=args.embedding_model, # Embedding model to use for vector storage
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Initialize LLM with DSPy
|
| 45 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
|
| 46 |
+
dspy.configure(lm=llm) # Set DSPy's global configuration for LLM usage
|
| 47 |
+
|
| 48 |
+
# Initialize the unoptimized RAG pipeline
|
| 49 |
+
uncompiled_rag = DSPyRAG(weaviate_db)
|
| 50 |
+
|
| 51 |
+
# Evaluate the unoptimized RAG pipeline using the development set
|
| 52 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 53 |
+
evaluate(uncompiled_rag, metric=DSPyEvaluator.llm_metric())
|
| 54 |
+
|
| 55 |
+
# Optimize the RAG pipeline using BootstrapFewShot
|
| 56 |
+
optimizer = BootstrapFewShot(metric=DSPyEvaluator.llm_metric())
|
| 57 |
+
|
| 58 |
+
# Compile an optimized version of the RAG model using the training set
|
| 59 |
+
optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
|
| 60 |
+
|
| 61 |
+
# Evaluate the optimized RAG pipeline
|
| 62 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 63 |
+
evaluate(optimized_compiled_rag)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
if __name__ == "__main__":
|
| 67 |
+
"""
|
| 68 |
+
Parses command-line arguments and runs the main function.
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
|
| 72 |
+
|
| 73 |
+
# Weaviate parameters (for vector storage and retrieval)
|
| 74 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
|
| 75 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
|
| 76 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
|
| 77 |
+
parser.add_argument(
|
| 78 |
+
"--embedding_model",
|
| 79 |
+
type=str,
|
| 80 |
+
default="jinaai/jina-embeddings-v3",
|
| 81 |
+
help="Embedding model used for document vectorization.",
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# LLM parameters (for DSPy-based language model inference)
|
| 85 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name.")
|
| 86 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for accessing the LLM service.")
|
| 87 |
+
parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM API calls.")
|
| 88 |
+
|
| 89 |
+
# Parse command-line arguments and execute the pipeline
|
| 90 |
+
args = parser.parse_args()
|
| 91 |
+
main(args)
|
src/rag_pipelines/pipelines/dspy_bootstrap_few_shot_random_search_optimization_rag.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
import dspy
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
|
| 8 |
+
|
| 9 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 10 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 11 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorDB
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main(args):
|
| 15 |
+
"""Main function to run the DSPy RAG optimization pipeline.
|
| 16 |
+
|
| 17 |
+
This function loads a dataset, initializes a Weaviate vector database and an LLM,
|
| 18 |
+
evaluates an unoptimized RAG pipeline, optimizes it using BootstrapFewShotWithRandomSearch,
|
| 19 |
+
and then evaluates the optimized pipeline.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
args (argparse.Namespace): Command-line arguments for configuring the pipeline.
|
| 23 |
+
"""
|
| 24 |
+
# Load dataset (Earnings Calls QA)
|
| 25 |
+
earnings_calls_data = load_dataset("lamini/earnings-calls-qa", split="train[:50]")
|
| 26 |
+
questions = earnings_calls_data["question"]
|
| 27 |
+
|
| 28 |
+
# Split dataset into training, development, and test sets
|
| 29 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 30 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 31 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]] # Test set (not used here)
|
| 32 |
+
|
| 33 |
+
# Initialize Weaviate Vector Database
|
| 34 |
+
weaviate_db = WeaviateVectorDB(
|
| 35 |
+
cluster_url=args.cluster_url,
|
| 36 |
+
api_key=args.api_key,
|
| 37 |
+
index_name=args.index_name,
|
| 38 |
+
model_name=args.embedding_model,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# Initialize the LLM
|
| 42 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=args.num_retries)
|
| 43 |
+
dspy.configure(lm=llm) # Set DSPy's global LLM configuration
|
| 44 |
+
|
| 45 |
+
# Initialize the unoptimized RAG pipeline
|
| 46 |
+
uncompiled_rag = DSPyRAG(weaviate_db)
|
| 47 |
+
|
| 48 |
+
# Evaluate the unoptimized RAG model
|
| 49 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 50 |
+
evaluate(uncompiled_rag, metric=DSPyEvaluator.llm_metric())
|
| 51 |
+
|
| 52 |
+
# Optimize RAG using BootstrapFewShotWithRandomSearch
|
| 53 |
+
optimizer = BootstrapFewShotWithRandomSearch(
|
| 54 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 55 |
+
max_bootstrapped_demos=args.max_bootstrapped_demos,
|
| 56 |
+
max_labeled_demos=args.max_labeled_demos,
|
| 57 |
+
max_rounds=args.max_rounds,
|
| 58 |
+
num_candidate_programs=args.num_candidate_programs,
|
| 59 |
+
num_threads=args.num_threads,
|
| 60 |
+
num_threads=args.num_threads,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Compile an optimized version of the RAG model
|
| 64 |
+
optimized_compiled_rag = optimizer.compile(uncompiled_rag, trainset=trainset)
|
| 65 |
+
|
| 66 |
+
# Evaluate the optimized RAG model
|
| 67 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 68 |
+
evaluate(optimized_compiled_rag)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
"""
|
| 73 |
+
Parses command-line arguments and runs the main function.
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
parser = argparse.ArgumentParser(description="DSPy RAG Optimization Pipeline")
|
| 77 |
+
|
| 78 |
+
# Weaviate parameters
|
| 79 |
+
parser.add_argument("--cluster_url", type=str, required=True, help="Weaviate cluster URL.")
|
| 80 |
+
parser.add_argument("--api_key", type=str, required=True, help="Weaviate API key.")
|
| 81 |
+
parser.add_argument("--index_name", type=str, required=True, help="Weaviate index name.")
|
| 82 |
+
parser.add_argument(
|
| 83 |
+
"--embedding_model",
|
| 84 |
+
type=str,
|
| 85 |
+
default="jinaai/jina-embeddings-v3",
|
| 86 |
+
help="Embedding model to use for vector retrieval.",
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# LLM parameters
|
| 90 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model name.")
|
| 91 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for accessing the LLM.")
|
| 92 |
+
parser.add_argument("--num_retries", type=int, default=120, help="Number of retries for LLM calls.")
|
| 93 |
+
|
| 94 |
+
# Optimization parameters
|
| 95 |
+
parser.add_argument("--max_bootstrapped_demos", type=int, default=4, help="Max bootstrapped demonstrations.")
|
| 96 |
+
parser.add_argument("--max_labeled_demos", type=int, default=4, help="Max labeled demonstrations.")
|
| 97 |
+
parser.add_argument("--max_rounds", type=int, default=1, help="Max optimization rounds.")
|
| 98 |
+
parser.add_argument("--num_candidate_programs", type=int, default=2, help="Number of candidate programs.")
|
| 99 |
+
parser.add_argument("--num_threads", type=int, default=2, help="Number of threads for optimization.")
|
| 100 |
+
|
| 101 |
+
# Parse arguments and run the main function
|
| 102 |
+
args = parser.parse_args()
|
| 103 |
+
main(args)
|
src/rag_pipelines/pipelines/dspy_copro_instruction_signature_optimization_rag.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weaviate
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dspy.evaluate.evaluate import Evaluate
|
| 7 |
+
from dspy.teleprompt import COPRO
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from weaviate.classes.init import Auth
|
| 10 |
+
|
| 11 |
+
from rag_pipelines.dspy.dspy_evaluator import DSPyEvaluator
|
| 12 |
+
from rag_pipelines.dspy.dspy_rag import DSPyRAG
|
| 13 |
+
from rag_pipelines.vectordb.weaviate import WeaviateVectorStore
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def parse_args():
|
| 17 |
+
"""Parse command-line arguments for the DSPy RAG pipeline with Weaviate and LLM evaluation.
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
argparse.Namespace: The parsed command-line arguments.
|
| 21 |
+
"""
|
| 22 |
+
parser = argparse.ArgumentParser(description="Run DSPy RAG pipeline with Weaviate and LLM evaluation.")
|
| 23 |
+
|
| 24 |
+
# Dataset Arguments
|
| 25 |
+
parser.add_argument(
|
| 26 |
+
"--dataset_name", type=str, default="lamini/earnings-calls-qa", help="Name of the dataset to use."
|
| 27 |
+
)
|
| 28 |
+
parser.add_argument("--dataset_size", type=int, default=50, help="Number of examples to load from the dataset.")
|
| 29 |
+
|
| 30 |
+
# Weaviate Configuration
|
| 31 |
+
parser.add_argument("--weaviate_url", type=str, required=True, help="Weaviate cloud cluster URL.")
|
| 32 |
+
parser.add_argument("--weaviate_api_key", type=str, required=True, help="API key for Weaviate.")
|
| 33 |
+
parser.add_argument("--index_name", type=str, required=True, help="Index name in Weaviate.")
|
| 34 |
+
parser.add_argument(
|
| 35 |
+
"--embedding_model", type=str, default="jinaai/jina-embeddings-v3", help="Embedding model for Weaviate."
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# LLM Configuration
|
| 39 |
+
parser.add_argument("--llm_model", type=str, default="groq/llama-3.3-70b-versatile", help="LLM model to use.")
|
| 40 |
+
parser.add_argument("--llm_api_key", type=str, required=True, help="API key for LLM.")
|
| 41 |
+
|
| 42 |
+
return parser.parse_args()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def main():
|
| 46 |
+
"""Run the DSPy RAG pipeline, including dataset loading, embedding initialization, Weaviate connection, LLM evaluation, and model optimization.
|
| 47 |
+
|
| 48 |
+
This function orchestrates the entire pipeline from loading data, preparing datasets,
|
| 49 |
+
connecting to Weaviate, initializing embeddings, evaluating the model, and optimizing the RAG pipeline.
|
| 50 |
+
"""
|
| 51 |
+
# Parse command-line arguments
|
| 52 |
+
args = parse_args()
|
| 53 |
+
|
| 54 |
+
# Load dataset from Hugging Face and extract questions
|
| 55 |
+
dataset = load_dataset(args.dataset_name, split=f"train[:{args.dataset_size}]")
|
| 56 |
+
questions = dataset["question"]
|
| 57 |
+
|
| 58 |
+
# Create DSPy datasets for training and evaluation
|
| 59 |
+
trainset = [dspy.Example(question=q).with_inputs("question") for q in questions[:20]]
|
| 60 |
+
devset = [dspy.Example(question=q).with_inputs("question") for q in questions[20:30]]
|
| 61 |
+
[dspy.Example(question=q).with_inputs("question") for q in questions[30:]]
|
| 62 |
+
|
| 63 |
+
# Initialize HuggingFace embeddings for retrieval tasks
|
| 64 |
+
model_kwargs = {"device": "cpu", "trust_remote_code": True}
|
| 65 |
+
encode_kwargs = {"task": "retrieval.query", "prompt_name": "retrieval.query"}
|
| 66 |
+
embeddings = HuggingFaceEmbeddings(
|
| 67 |
+
model_name=args.embedding_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
# Connect to Weaviate using the provided URL and API key
|
| 71 |
+
weaviate_client = weaviate.connect_to_weaviate_cloud(
|
| 72 |
+
cluster_url=args.weaviate_url,
|
| 73 |
+
auth_credentials=Auth.api_key(args.weaviate_api_key),
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
# Initialize Weaviate vector store with the embeddings and client connection
|
| 77 |
+
WeaviateVectorStore(
|
| 78 |
+
index_name=args.index_name,
|
| 79 |
+
embedding=embeddings,
|
| 80 |
+
client=weaviate_client,
|
| 81 |
+
text_key="text",
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# Initialize the LLM (Language Model) with the specified model and API key
|
| 85 |
+
llm = dspy.LM(args.llm_model, api_key=args.llm_api_key, num_retries=120)
|
| 86 |
+
dspy.configure(lm=llm)
|
| 87 |
+
|
| 88 |
+
# Evaluate the initial RAG pipeline
|
| 89 |
+
evaluate = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
|
| 90 |
+
evaluate(DSPyRAG(), metric=DSPyEvaluator.llm_metric())
|
| 91 |
+
|
| 92 |
+
# Optimize the RAG model using COPRO (Collaborative Prompt Optimization)
|
| 93 |
+
optimizer = COPRO(
|
| 94 |
+
prompt_model=dspy.settings.lm,
|
| 95 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 96 |
+
breadth=3,
|
| 97 |
+
depth=2,
|
| 98 |
+
init_temperature=0.25,
|
| 99 |
+
verbose=False,
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Compile the optimized RAG model with the training set
|
| 103 |
+
optimized_compiled_rag = optimizer.compile(
|
| 104 |
+
DSPyRAG(),
|
| 105 |
+
trainset=trainset,
|
| 106 |
+
eval_kwargs={"num_threads": 1, "display_progress": True, "display_table": 0},
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Evaluate the optimized model on the development set
|
| 110 |
+
evaluate = Evaluate(
|
| 111 |
+
metric=DSPyEvaluator.llm_metric(),
|
| 112 |
+
devset=devset,
|
| 113 |
+
num_threads=1,
|
| 114 |
+
display_progress=True,
|
| 115 |
+
display_table=5,
|
| 116 |
+
)
|
| 117 |
+
evaluate(optimized_compiled_rag)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
|
| 121 |
+
main()
|
src/rag_pipelines/pipelines/dspy_rag.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
import dspy
|
| 4 |
+
import weave
|
| 5 |
+
from dspy import LM, Module
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DSPyRAGPipeline(weave.Model):
    """A Retrieval-Augmented Generation (RAG) pipeline wrapping a DSPy module.

    Exposes a DSPy RAG module behind a Weave ``Model`` so that every
    ``predict`` call is traced as a Weave op.

    Attributes:
        llm (LM): The language model used for generating predictions.
        rag_module (Module): The DSPy module used for retrieval + generation.
    """

    llm: LM
    rag_module: Module

    def __init__(self, llm: LM, rag_module: Module) -> None:
        """Initialize the DSPyRAG model.

        Args:
            llm (LM): The language model to be used.
            rag_module (Module): The module to be used for retrieval tasks.

        Note:
            ``dspy.configure(lm=llm)`` mutates DSPy's process-global settings;
            constructing this pipeline therefore changes the LM seen by any
            other DSPy code running in the same process.
        """
        # weave.Model is Pydantic-based: fields must go through
        # super().__init__ to be validated and registered.
        super().__init__(llm=llm, rag_module=rag_module)

        # NOTE(review): these assignments look redundant after the
        # super().__init__ call above (Pydantic already set the fields) —
        # presumably kept for explicitness; confirm before removing.
        self.llm = llm
        self.rag_module = rag_module

        dspy.configure(lm=llm)

    @weave.op()
    def predict(self, input: str) -> dict[str, Any]:
        """Predict the answer to a given question using the RAG approach.

        Args:
            input (str): The question to be answered.

        Returns:
            dict[str, Any]: A dictionary with two keys:
                - "output" (str): The predicted answer to the question.
                - "retrieval_context" (Any): The retrieval context produced by
                  the RAG module (see ``RAG.forward``, which returns a
                  ``Prediction`` carrying ``answer`` and ``retrieval_context``).
        """
        prediction = self.rag_module(input)

        return {"output": prediction.answer, "retrieval_context": prediction.retrieval_context}
|
src/rag_pipelines/pipelines/dspy_rag_module.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from dspy import ChainOfThought, Module, Prediction
|
| 4 |
+
|
| 5 |
+
from rag_pipelines.evaluation import retrieval
|
| 6 |
+
from rag_pipelines.prompts import GenerateAnswerFromContext
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RAG(Module):
    """Retrieval-Augmented Generation (RAG) module that retrieves context based on a question and generates an answer using that context."""

    def __init__(self, retriever: Any):
        """Initialize the RAG module.

        Args:
            retriever (Any): A callable retriever whose result exposes a
                ``passages`` attribute (e.g. a Milvus retriever).
        """
        super().__init__()
        self.retrieve = retriever
        self.generate_answer = ChainOfThought(GenerateAnswerFromContext)

    def forward(self, question: str) -> Prediction:
        """Process a question by retrieving context and generating an answer.

        Args:
            question (str): The question to be answered.

        Returns:
            Prediction: Carries ``retrieval_context`` (the raw text of each
            retrieved passage) and ``answer`` (the generated answer).
        """
        # Fetch supporting passages for the question.
        passages = self.retrieve(question).passages

        # Produce an answer conditioned on those passages.
        generated = self.generate_answer(context=passages, question=question)

        # Surface both the plain-text context and the answer to callers.
        passage_texts = [passage.long_text for passage in passages]
        return Prediction(retrieval_context=passage_texts, answer=generated.answer)
|
src/rag_pipelines/pipelines/rag.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
import weave
|
| 5 |
+
from langchain_community.retrievers import PineconeHybridSearchRetriever
|
| 6 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 7 |
+
from langchain_core.prompts.chat import ChatPromptTemplate
|
| 8 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 9 |
+
from langchain_groq import ChatGroq
|
| 10 |
+
from weave import Model
|
| 11 |
+
from weave.integrations.langchain import WeaveTracer
|
| 12 |
+
|
| 13 |
+
# Disable global tracing explicitly.
# NOTE(review): presumably this env var opts out of Weave's automatic
# LangChain tracing so that tracing happens only via the per-call
# WeaveTracer callback in RAGPipeline.predict — confirm against Weave docs.
os.environ["WEAVE_TRACE_LANGCHAIN"] = "false"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class RAGPipeline(Model):
    """A hybrid retrieval-augmented generation (RAG) pipeline using Weave for tracing and LangChain components.

    This pipeline integrates a retriever, prompt template, and language model (LLM) to implement a retrieval-augmented
    generation system, where the LLM generates answers based on both retrieved documents and a prompt template.
    Weave is used for tracing to monitor the pipeline's execution.

    Attributes:
        retriever: The retrieval model used to fetch relevant documents based on a query.
        prompt: The prompt template to generate questions for the LLM.
        llm: The language model used to generate responses.
        tracer: The tracer used to record the execution details with Weave.
        tracing_project_name: The name of the Weave project for tracing.
        weave_params: Extra keyword arguments forwarded to ``weave.init``.
    """

    retriever: Optional[PineconeHybridSearchRetriever] = None
    prompt: Optional[ChatPromptTemplate] = None
    llm: Optional[ChatGroq] = None
    tracing_project_name: str
    weave_params: dict[str, Any]
    tracer: Optional[WeaveTracer] = None

    def __init__(self, retriever, prompt, llm, tracing_project_name="hybrid_rag", weave_params=None):
        """Initialize the RAGPipeline.

        Sets up the retriever, prompt, and LLM, then initializes Weave tracing.

        Args:
            retriever: The retrieval model used to fetch documents for the RAG pipeline.
            prompt: The prompt template used to create questions for the LLM.
            llm: The language model used for response generation based on retrieved documents and prompt.
            tracing_project_name (str): The name of the Weave project for tracing. Defaults to "hybrid_rag".
            weave_params (dict): Additional parameters for initializing Weave. This can include configuration
                details or authentication settings for the Weave service. Defaults to an empty dict.
        """
        # BUG FIX: normalize BEFORE super().__init__(). The `weave_params`
        # field is declared as `dict[str, Any]` (not Optional), so passing
        # None through Pydantic validation would raise; the original code
        # normalized only after the super() call, which was too late.
        if weave_params is None:
            weave_params = {}

        super().__init__(
            retriever=retriever,
            prompt=prompt,
            llm=llm,
            tracing_project_name=tracing_project_name,
            weave_params=weave_params,
        )

        self.retriever = retriever
        self.prompt = prompt
        self.llm = llm
        self.tracing_project_name = tracing_project_name

        # weave_params is now always a dict, so one call covers both the
        # "params provided" and "defaults" cases — the original
        # `if weave_params: ... else: ...` was redundant (`f(**{})` == `f()`).
        self._initialize_weave(**weave_params)

    def _initialize_weave(self, **weave_params):
        """Initialize Weave with the specified tracing project name.

        Sets up the Weave project and creates the WeaveTracer instance that
        records the execution of each step in the RAG pipeline for monitoring
        and debugging purposes.
        """
        # Initialize the Weave project, forwarding any extra configuration.
        weave.init(self.tracing_project_name, **weave_params)
        # Set up the tracer for tracking pipeline execution.
        self.tracer = WeaveTracer()

    @weave.op()
    def predict(self, question: str) -> str:
        """Execute the hybrid RAG pipeline with the given question.

        Retrieves documents with the retriever, formats them into a context
        string, fills the prompt template, and runs the LLM. Each invocation
        is traced with the Weave callback tracer.

        Args:
            question (str): The input question to be answered by the pipeline.

        Returns:
            str: The answer generated by the LLM based on the retrieved documents and the question prompt.
        """
        # Attach the Weave tracer so this invocation is recorded.
        config = {"callbacks": [self.tracer]}

        # LCEL chain: retrieve -> format docs into "context", pass the raw
        # question through, fill the prompt, call the LLM, parse to str.
        rag_chain = (
            {
                "context": self.retriever | self.format_docs,
                "question": RunnablePassthrough(),
            }
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

        return rag_chain.invoke(question, config=config)

    def format_docs(self, docs):
        """Format retrieved documents into a string for input to the LLM.

        Each document is rendered with its filing date, accession number,
        excerpt, optional image descriptions, and summary; the result is the
        "context" fed to the prompt template.

        Args:
            docs (list): Retrieved document objects; each must carry
                ``page_content`` and metadata keys ``filing_date``,
                ``accession_no``, ``summary``, and ``image_descriptions``.

        Returns:
            str: A formatted string of document contents, joined by newline characters.
        """
        context = ""
        for doc in docs:
            date = doc.metadata["filing_date"]
            accession_no = doc.metadata["accession_no"]
            summary = doc.metadata["summary"]
            image_descriptions = doc.metadata["image_descriptions"]
            context += (
                f"# Report {accession_no} filed on {date}:\n\n## An excerpt from the report"
                f"\n\n{doc.page_content}\n\n"
            )
            # Only include the image section when descriptions exist.
            if image_descriptions:
                context += f"## Image descriptions\n\n{image_descriptions}\n\n"
            context += (
                "## Summary of the report\n\nHere's a summary of the report along with the some "
                f"important keywords and phrases present in the report:\n\n{summary}\n\n"
            )

        return context
|