File size: 14,267 Bytes
934d3e0
1603571
934d3e0
 
 
 
16fd4fe
1603571
 
 
934d3e0
 
 
1603571
16fd4fe
934d3e0
1603571
934d3e0
 
e37d064
934d3e0
1603571
 
 
 
934d3e0
 
 
 
 
 
 
e37d064
1603571
 
 
 
934d3e0
1603571
 
934d3e0
 
 
1603571
 
 
934d3e0
 
 
 
 
 
 
 
 
 
 
 
1603571
 
 
934d3e0
1603571
934d3e0
 
 
 
 
 
 
 
 
1603571
 
 
934d3e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1603571
 
 
 
 
 
 
 
934d3e0
 
 
 
 
 
 
1603571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d1392a
1603571
 
 
 
 
 
16fd4fe
1603571
 
 
 
 
934d3e0
e37d064
 
1603571
 
 
 
 
 
 
 
16fd4fe
 
 
 
 
 
 
 
1603571
9e00936
1603571
 
 
 
3d1392a
1603571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e37d064
 
1603571
 
2c56f66
1603571
2c56f66
1603571
 
 
934d3e0
 
 
 
1603571
934d3e0
 
1603571
934d3e0
 
 
1603571
 
934d3e0
1603571
934d3e0
1603571
 
 
 
934d3e0
1603571
 
934d3e0
 
 
1603571
 
 
934d3e0
 
 
 
1603571
 
934d3e0
1603571
 
934d3e0
1603571
 
 
 
 
 
 
 
 
934d3e0
 
 
 
 
1603571
 
 
 
 
 
 
 
 
 
 
 
 
 
16fd4fe
 
 
 
 
1603571
 
 
 
 
e37d064
 
 
 
1603571
 
 
 
 
 
 
934d3e0
 
 
 
 
 
 
1603571
 
 
9e00936
1603571
 
 
 
 
 
 
 
3d1392a
934d3e0
e37d064
934d3e0
e37d064
 
 
 
 
934d3e0
1603571
934d3e0
1603571
9e00936
1603571
 
934d3e0
 
1603571
 
934d3e0
9e00936
1603571
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
import asyncio
import os
import time
from collections import OrderedDict
from functools import partial
from textwrap import dedent
from typing import ClassVar, Dict, List, Optional, Union

import pandas as pd
from langchain_core import outputs
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langsmith import Client, evaluate, evaluation
from langsmith.evaluation import EvaluationResult, aevaluate, run_evaluator
from langsmith.evaluation.evaluator import DynamicRunEvaluator, EvaluationResults
from langsmith.schemas import Dataset
from langsmith.utils import LangSmithError
from pandas._libs.tslibs.np_datetime import py_td64_to_tdstruct
from pandas.core.dtypes.dtypes import re
from pydantic import BaseModel, ConfigDict, Field
from ragas import SingleTurnSample
from ragas.llms.base import LangchainLLMWrapper

from src.config import Config, load_spreadsheet
from src.rag import (
    ChromaSlideStore,
    HyperbolicScorer,
    MinScorer,
    PresentationRetriever,
    ScorerTypes,
)
from src.rag.storage import LLMPresentationRetriever


@run_evaluator
def presentation_match(run, example) -> EvaluationResult:
    """Check whether the top-1 retrieved presentation is the ground-truth one.

    Args:
        run: LangSmith run. ``run.outputs["contexts"]`` is read as a list of
            dicts (best match first) that each carry a ``"pres_name"`` key.
        example: Dataset example. ``example.outputs["pres_name"]`` holds the
            expected presentation name.

    Returns:
        EvaluationResult keyed ``"presentation_match"`` with score 1 when the
        first retrieved presentation matches and 0 otherwise, including the
        case where nothing was retrieved.
    """
    contexts = run.outputs["contexts"]
    if not contexts:
        # An empty retrieval is a miss; indexing [0] would raise IndexError
        # and the example would end up without any feedback at all.
        return EvaluationResult(key="presentation_match", score=0)

    match = int(contexts[0]["pres_name"] == example.outputs["pres_name"])
    return EvaluationResult(key="presentation_match", score=match)


@run_evaluator
def presentation_found(run, example) -> EvaluationResult:
    """Check whether the ground-truth presentation appears among the
    retrieved presentations, at any rank.

    Scoring: 1 if it appears, 0 otherwise.
    """
    # Collect the name of every retrieved presentation first.
    retrieved_names = []
    for ctx in run.outputs["contexts"]:
        retrieved_names.append(ctx["pres_name"])

    is_present = example.outputs["pres_name"] in retrieved_names
    return EvaluationResult(key="presentation_found", score=int(is_present))


@run_evaluator
def page_match(run, example) -> EvaluationResult:
    """Check whether the best retrieved page of the ground-truth presentation
    is one of the reference pages.

    Only retrieved presentations whose ``"pres_name"`` equals the expected one
    are inspected, regardless of their rank.

    Args:
        run: LangSmith run. ``run.outputs["contexts"]`` is a list of dicts with
            ``"pres_name"`` and ``"pages"`` (best page first).
        example: Dataset example with ``outputs["pres_name"]`` and
            ``outputs["pages"]``.

    Returns:
        EvaluationResult keyed ``"page_match"``. Score is 1 when the expected
        presentation was retrieved and either no reference pages are specified
        or its first page is a reference page; 0 otherwise.
    """
    expected_name = example.outputs["pres_name"]
    # fill_dataset writes -1 for an empty page cell, so drop that placeholder:
    # "no pages specified" must end up as an empty list for the check below.
    reference_pages = [p for p in (example.outputs.get("pages") or []) if p != -1]

    score = 0
    for pres_info in run.outputs["contexts"]:
        if pres_info["pres_name"] != expected_name:
            continue
        if not reference_pages:
            # Nothing to compare against: finding the presentation is enough.
            score = 1
            break
        # Index the best page only for the matching presentation, and only if
        # it has pages, so an empty page list cannot raise IndexError.
        pages = pres_info["pages"]
        if pages and pages[0] in reference_pages:
            score = 1
            break
    return EvaluationResult(key="page_match", score=score)


@run_evaluator
def page_found(run, example) -> EvaluationResult:
    """Check whether every reference page was retrieved for the ground-truth
    presentation.

    The ground-truth presentation is considered even when it is not top-1.

    Args:
        run: LangSmith run. ``run.outputs["contexts"]`` is a list of dicts with
            ``"pres_name"`` and ``"pages"``.
        example: Dataset example with ``outputs["pres_name"]`` and
            ``outputs["pages"]``.

    Returns:
        EvaluationResult keyed ``"page_found"``. Score is 1 when the expected
        presentation was retrieved and either no reference pages are specified
        or all of them are among its retrieved pages; 0 otherwise.
    """
    expected_name = example.outputs["pres_name"]
    # fill_dataset writes -1 for an empty page cell, so drop that placeholder:
    # "no pages specified" must end up as an empty set for the check below.
    reference_pages = {p for p in (example.outputs.get("pages") or []) if p != -1}

    score = 0
    for pres_info in run.outputs["contexts"]:
        if pres_info["pres_name"] != expected_name:
            continue
        # An empty reference set is a subset of anything, which covers the
        # "no pages specified" case as well.
        if reference_pages <= set(pres_info["pages"]):
            score = 1
            break
    return EvaluationResult(key="page_found", score=score)


@run_evaluator
def n_pages(run, example) -> EvaluationResult:
    """Report how many pages were retrieved for the top-1 presentation.

    Returns:
        EvaluationResult keyed ``"n_pages"`` whose score is the length of the
        first context's ``"pages"`` list, or 0 when nothing was retrieved.
    """
    contexts = run.outputs["contexts"]
    # No retrieved presentation means zero pages; avoid IndexError on [0].
    n_pgs = len(contexts[0]["pages"]) if contexts else 0
    return EvaluationResult(key="n_pages", score=n_pgs)


@run_evaluator
def n_pres(run, example) -> EvaluationResult:
    """Report how many presentations the run returned in its contexts."""
    retrieved = run.outputs["contexts"]
    return EvaluationResult(key="n_pres", score=len(retrieved))


def create_llm_relevance_evaluator(llm=None, n_contexts: int = -1):
    """Build a LangSmith evaluator that asks an LLM whether the top-1
    presentation's slide descriptions are relevant to the question.

    Args:
        llm: Chat model used as the judge. When ``None``, the project's
            ``openai/gpt-4o-mini`` model is loaded via ``Config``.
        n_contexts: Number of slide descriptions of the top presentation to
            show the judge. Values <= 0 mean "use all of them".

    Returns:
        A ``run_evaluator``-wrapped function named ``llm_relevance`` that
        yields an EvaluationResult with the judge's 0/1 score and its
        explanation as the comment.
    """

    # Expected shape of the judge's JSON answer.
    class RelevanceOutput(BaseModel):
        explanation: str = Field(description="Explanation for the relevance score")
        relevance_score: int = Field(description="Relevance score (0 or 1)")

    prompt_template = PromptTemplate.from_template(
        """\
You will act as an expert relevance assessor for a presentation retrieval system. Your task is to evaluate whether the retrieved slide descriptions contain relevant information for the user's query. Consider both textual content and references to visual elements (images, charts, graphs) as equally valid sources of information.

Evaluation Rules:
- Assign score 1 if the descriptions contain ANY relevant information that helps answer the query
- Assign score 0 only if the descriptions are completely unrelated or provide no useful information
- Treat references to visual elements (e.g., "graph shows increasing trend" or "image depicts workflow") as valid information
- Consider partial matches as relevant (score 1) as long as they provide some value in answering the query

For each evaluation, you will receive:
1. The user's query
2. Retrieved slide descriptions

# Query
{query}

--- END OF QUERY ---

# Slide Descriptions
{context}

--- END OF SLIDE DESCRIPTIONS ---

Format output as JSON:

```json
{{
  "explanation": string, # Clear justification explaining why the content is relevant or irrelevant
  "relevance_score": int  # 1 if any relevant information is found, 0 if completely irrelevant
}}
```
"""
    )

    # Honor the caller's model; only fall back to the project default when
    # none was supplied (previously the argument was always overwritten).
    if llm is None:
        llm = Config().model_config.load_vsegpt(model="openai/gpt-4o-mini")
    chain = (
        prompt_template
        | llm
        | StrOutputParser()
        | JsonOutputParser(pydantic_object=RelevanceOutput)
    )

    @run_evaluator
    def llm_relevance(run, example) -> EvaluationResult:
        retrieved = run.outputs["contexts"]
        if not retrieved:
            # Nothing to judge: score as irrelevant without calling the LLM.
            return EvaluationResult(
                key="llm_relevance",
                score=0,
                comment="No presentations were retrieved",
            )

        # NOTE(review): presumably throttles requests to the LLM provider —
        # confirm the required delay.
        time.sleep(1.05)
        question = run.inputs["inputs"]["question"]
        pres = retrieved[0]

        contexts_used = (
            pres["contexts"] if n_contexts <= 0 else pres["contexts"][:n_contexts]
        )
        pres_context = "\n\n---\n".join(contexts_used)
        llm_out = chain.invoke(dict(query=question, context=pres_context))
        return EvaluationResult(
            key="llm_relevance",
            score=llm_out["relevance_score"],
            comment=llm_out["explanation"],
        )

    return llm_relevance


def create_ragas_evaluator(metric):
    """Wrap a RAGAS metric as a LangSmith run evaluator.

    Args:
        metric: Initialized RAGAS metric. Its ``single_turn_ascore`` coroutine
            and ``name`` attribute are used.

    Returns:
        Async evaluator function compatible with LangSmith. The result key is
        ``metric.name``.

    Example:
      >>> from ragas.metrics import AnswerRelevancy
      >>> metric = AnswerRelevancy(llm=wrapped_llm, embeddings=embedding_model)
      >>> evaluator = create_ragas_evaluator(metric)
      >>> evaluate(target, data="my-dataset", evaluators=[evaluator])
    """

    @run_evaluator
    async def evaluate(run, example) -> EvaluationResult:
        # Gather the fields of a single-turn sample from the example and run.
        sample_fields = dict(
            user_input=example.inputs["question"],
            response=run.outputs["answer"],
            retrieved_contexts=run.outputs["contexts"],
        )
        sample = SingleTurnSample(**sample_fields)
        metric_score = await metric.single_turn_ascore(sample)
        return EvaluationResult(key=metric.name, score=metric_score)

    return evaluate


class LangsmithConfig(BaseModel):
    """Configuration for RAG evaluation.

    After validation, ``n_contexts`` and ``n_pages`` are copied onto the
    supplied retriever so the retriever and this config agree.
    """

    dataset_name: str = "RAG_test"

    # Configure Retrieval
    scorers: List[ScorerTypes] = [MinScorer(), HyperbolicScorer()]
    retriever: Union[PresentationRetriever, LLMPresentationRetriever]

    # Setup Evaluators
    evaluators: List[DynamicRunEvaluator] = [presentation_match, page_match]

    # Configure RAGAS
    # ragas_metrics: List[type] = [Faithfulness]  # List of metric classes
    n_contexts: int = 10
    n_pages: int = 3

    # Configure evaluation
    max_concurrency: int = 2
    experiment_prefix: Optional[str] = None
    # Read the environment when the config is instantiated rather than when
    # the module is imported, so values loaded later (e.g. by load_dotenv())
    # are picked up.
    sheet_id: Optional[str] = Field(
        default_factory=lambda: os.environ.get("BENCHMARK_SPREADSHEET_ID")
    )

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def model_post_init(self, context) -> None:
        """Pydantic hook run after validation; applies the retriever limits.

        Pydantic models do not call the dataclass-style ``__post_init__``
        themselves, so it is invoked from here.
        """
        self.__post_init__()

    def __post_init__(self):
        """Copy ``n_contexts`` and ``n_pages`` onto the retriever."""
        self.retriever.n_contexts = self.n_contexts
        self.retriever.n_pages = self.n_pages

    def get_scored_retriever(self, scorer: ScorerTypes):
        """Set ``scorer`` on the shared retriever and return that retriever.

        The same retriever instance is reused and mutated on every call.
        """
        self.retriever.set_scorer(scorer)
        return self.retriever


class RAGEvaluatorLangsmith:
    """Evaluator for RAG pipeline using LangSmith"""

    def __init__(
        self,
        config: LangsmithConfig,
        llm: Optional[ChatOpenAI] = None,
    ):
        """Set up LangSmith tracing defaults, the client and the LLM.

        Args:
            config: Evaluation configuration (dataset, retriever, scorers,
                evaluators).
            llm: Chat model to use. When ``None``, the project's
                ``openai/gpt-4o-mini`` model is loaded here, at call time,
                instead of once when the module is imported.
        """
        # Enable LangSmith tracing; values already set in the environment win.
        os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")
        os.environ.setdefault("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
        os.environ.setdefault("LANGCHAIN_PROJECT", "presentation_rag")

        # Setup class
        self.client = Client()
        self.config = config
        if llm is None:
            llm = Config().model_config.load_vsegpt(model="openai/gpt-4o-mini")
        self.llm = llm
        self.llm_wrapped = LangchainLLMWrapper(self.llm)
        # Populated by create_or_load_dataset().
        self.dataset: Optional[Dataset] = None

    @classmethod
    def load_questions_from_sheet(cls, *args, **kwargs) -> pd.DataFrame:
        """Load evaluation questions from Google Sheets and preprocess dataset.

        All arguments are forwarded to ``load_spreadsheet``. Missing values in
        the ``page`` column are replaced with an empty string.
        """
        df = load_spreadsheet(*args, **kwargs)
        df.fillna(dict(page=""), inplace=True)
        return df

    def create_dataset(self, dataset_name: str, df: pd.DataFrame) -> Dataset:
        """Create a LangSmith dataset named ``dataset_name`` and fill it
        with the rows of ``df``.

        Returns:
            The dataset object returned by the client on creation.
        """
        dataset = self.client.create_dataset(dataset_name=dataset_name)
        self.fill_dataset(dataset_name, df)
        return dataset

    def fill_dataset(self, dataset_name, df: pd.DataFrame):
        """Upload one example per row of ``df`` to the named dataset.

        Reads the columns ``question``, ``pres_name``, ``page`` and
        ``content``. ``page`` is split on commas into integers; an empty
        token is stored as ``-1``.
        """
        examples = dict(inputs=[], outputs=[], metadata=[])
        for _, row in df.iterrows():
            examples["inputs"].append(dict(question=row["question"]))
            examples["outputs"].append(
                dict(
                    pres_name=row["pres_name"],
                    pages=[int(x) if x else -1 for x in row["page"].split(",")],
                )
            )
            examples["metadata"].append(dict(content=row["content"]))

        self.client.create_examples(
            inputs=examples["inputs"],
            outputs=examples["outputs"],
            metadata=examples["metadata"],
            dataset_name=dataset_name,
        )

    def load_dataset(self, dataset_name: str):
        """Read an existing LangSmith dataset by name."""
        return self.client.read_dataset(dataset_name=dataset_name)

    def create_or_load_dataset(self, df: Optional[pd.DataFrame] = None) -> Dataset:
        """Create or load evaluation dataset in LangSmith.

        Args:
            df: Questions used to create the dataset when it does not exist
                yet. Ignored when the dataset already exists.

        Returns:
            The loaded or newly created dataset (also stored on
            ``self.dataset``).

        Raises:
            ValueError: The dataset does not exist and ``df`` is ``None``.
        """
        dataset_name = self.config.dataset_name

        # See if dataset with this name already exists
        existing_names = [d.name for d in self.client.list_datasets()]
        if dataset_name in existing_names:
            self.dataset = self.load_dataset(dataset_name)
            print(f"Using existing dataset: {self.dataset.name}")
            return self.dataset

        # Create new dataset otherwise
        if df is None:
            raise ValueError("No dataset provided")
        self.dataset = self.create_dataset(dataset_name=dataset_name, df=df)
        print(f"Created new dataset: {self.dataset.name}")
        return self.dataset

    def _build_evaluator_chains(self) -> Dict:
        """Map each configured evaluator's ``_name`` to the evaluator."""
        chains = {e._name: e for e in self.config.evaluators}

        # For ragas metrics
        # embedding_model = self.storage._embeddings
        # for metric_cls in self.config.ragas_metrics:
        #     metric = metric_cls(llm=self.llm, embeddings=embedding_model)
        #     evaluator = create_ragas_evaluator(metric)
        #     chains[metric_cls.name] = evaluator

        return chains

    def run_evaluation(self) -> None:
        """Run evaluation for all configured scorers.

        One LangSmith experiment is started per scorer. The experiment prefix
        is ``"<experiment_prefix>_<scorer.id>"`` when a prefix is configured,
        otherwise just the scorer id.
        """
        chains = self._build_evaluator_chains()

        for scorer in self.config.scorers:
            if self.config.experiment_prefix:
                experiment_prefix = f"{self.config.experiment_prefix}_{scorer.id}"
            else:
                experiment_prefix = f"{scorer.id}"

            # The shared retriever is re-pointed at the current scorer.
            retriever = self.config.get_scored_retriever(scorer)

            evaluate(
                retriever,
                experiment_prefix=experiment_prefix,
                data=self.config.dataset_name,
                evaluators=list(chains.values()),
                metadata=dict(
                    scorer=scorer.id,
                    retriever=self.config.retriever.__class__.__name__,
                ),
                max_concurrency=self.config.max_concurrency,
            )


def main():
    """Run the retrieval evaluation on the ``PresRetrieve_5`` dataset.

    Loads environment variables, builds the slide store and an LLM-based
    retriever, and starts one LangSmith experiment per configured scorer.
    """
    from dotenv import load_dotenv

    from src.rag.score import ExponentialScorer, MinScorer

    # Load env variables
    load_dotenv()
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

    # Setup llm and embeddings
    project_config = Config()
    llm = project_config.model_config.load_vsegpt(model="openai/gpt-4o-mini")
    embeddings = project_config.embedding_config.load_vsegpt()

    # Initialize components
    storage = ChromaSlideStore(collection_name="pres0", embedding_model=embeddings)
    # LangsmithConfig requires a `retriever` instance (it has no
    # `retriever_cls` field), so build the retriever here.
    # NOTE(review): the constructor arguments (storage=, llm=) are assumed —
    # confirm against LLMPresentationRetriever's definition.
    retriever = LLMPresentationRetriever(storage=storage, llm=llm)
    eval_config = LangsmithConfig(
        dataset_name="PresRetrieve_5",
        retriever=retriever,
        evaluators=[
            presentation_match,
            presentation_found,
            page_match,
            page_found,
            # create_llm_relevance_evaluator(llm),
        ],
        scorers=[MinScorer(), ExponentialScorer()],
        max_concurrency=1,
    )
    # RAGEvaluatorLangsmith.__init__ accepts only `config` and `llm`.
    evaluator = RAGEvaluatorLangsmith(config=eval_config, llm=llm)

    # Load questions if needed
    # sheet_id = os.environ["BENCHMARK_SPREADSHEET_ID"]
    # questions_df = evaluator.load_questions_from_sheet(sheet_id)

    # Create or load dataset
    # evaluator.create_or_load_dataset(questions_df)

    # Run evaluation
    evaluator.run_evaluation()


# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()