File size: 3,126 Bytes
257dcc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner
from llama_index.llms.openai import OpenAI
from utils import load_db  # Assuming utils.py is in the same directory

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

import asyncio
import nest_asyncio

# Patch asyncio so an event loop can be (re-)entered while one is already
# running; the evaluation below is started with asyncio.run().
nest_asyncio.apply()

# Load variables from a local .env file BEFORE any OpenAI-backed object is
# constructed, so that credentials defined only in .env (e.g. OPENAI_API_KEY)
# are already in the process environment when the embedding model is built.
# Calling load_dotenv() again later is harmless: by default it does not
# override variables that are already set.
load_dotenv()

# Global embedding model used by LlamaIndex for every embedding call.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
    timeout=60,  # Increase timeout to 60 seconds
    max_retries=5,  # Increase max retries
    # NOTE(review): confirm that OpenAIEmbedding actually honours a
    # `retry_delay` keyword; it is kept here unchanged from the original.
    retry_delay=2  # Wait 2 seconds between retries
)

# Optional tracing of API calls. Uncomment the lines below to attach a debug
# handler that prints a trace when each operation ends.
# debug_handler = LlamaDebugHandler(print_trace_on_end=True)
# callback_manager = CallbackManager([debug_handler])
# Settings.callback_manager = callback_manager

# NOTE(review): verify that LlamaIndex reads a `retry_policy` attribute from
# Settings; if it does not, this assignment only stores the dict and has no
# effect on retry behaviour.
Settings.retry_policy = {
    "max_retries": 5,
    "retry_delay": 2.0,
    "exponential_backoff": True
}
def _pass_rate(results):
    """Return the fraction of *results* whose ``passing`` flag is truthy.

    A ``passing`` value of ``None`` is counted as a failure instead of
    raising ``TypeError`` inside ``sum``. An empty *results* sequence yields
    ``nan`` rather than raising ``ZeroDivisionError``.
    """
    if not results:
        return float("nan")
    return sum(bool(result.passing) for result in results) / len(results)


async def run_evaluation(
    dataset_path: str = "./rag_eval_dataset.json",
    top_k_values: tuple = (2, 4, 6),
    max_queries: "int | None" = 5,
) -> None:
    """Evaluate faithfulness and relevancy for several ``similarity_top_k`` values.

    For every value in *top_k_values* a query engine is built from the index
    returned by ``load_db()``, the selected questions are answered with
    ``gpt-4o-mini`` and the answers are judged by ``gpt-4o``. The pass rate of
    each evaluator is printed per value.

    Called without arguments, the function behaves as before: it reads
    ``./rag_eval_dataset.json``, uses the first 5 questions and tries
    ``similarity_top_k`` values 2, 4 and 6.

    Args:
        dataset_path: Path of the JSON file read with
            ``EmbeddingQAFinetuneDataset.from_json``.
        top_k_values: Iterable of ``similarity_top_k`` values to compare.
        max_queries: Maximum number of questions to evaluate, taken from the
            start of the dataset. ``None`` evaluates every question.

    Raises:
        ValueError: If *max_queries* is negative.
    """
    if max_queries is not None and max_queries < 0:
        raise ValueError("max_queries must be non-negative or None")

    load_dotenv()
    index = load_db()
    rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(dataset_path)

    # Extract the questions and cap how many are evaluated to save time and
    # API cost; slicing with None keeps the full list.
    queries = list(rag_eval_dataset.queries.values())
    batch_eval_queries = queries[:max_queries]
    if not batch_eval_queries:
        # Nothing to score: avoid building evaluators and dividing by zero.
        print(f"No queries to evaluate from {dataset_path}")
        return

    # gpt-4o acts as the judge; gpt-4o-mini generates the answers under test.
    llm_gpt4o = OpenAI(temperature=0, model="gpt-4o")
    llm_gpt4o_mini = OpenAI(temperature=0, model="gpt-4o-mini")

    # Initiate the faithfulness and relevancy evaluator objects
    faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4o)
    relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4o)

    # The batch evaluator runs the evaluation in batches
    runner = BatchEvalRunner(
        {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
        workers=8,
    )

    # Try each `similarity_top_k` value in turn
    for i in top_k_values:
        # Set query engine with a different number of returned chunks
        query_engine = index.as_query_engine(similarity_top_k=i, llm=llm_gpt4o_mini)
        # Run the evaluation
        eval_results = await runner.aevaluate_queries(
            query_engine, queries=batch_eval_queries
        )
        # Print the results
        faithfulness_score = _pass_rate(eval_results["faithfulness"])
        print(f"top_{i} faithfulness_score: {faithfulness_score}")
        relevancy_score = _pass_rate(eval_results["relevancy"])
        print(f"top_{i} relevancy_score: {relevancy_score}")
        print("="*15)

# Script entry point: drive the evaluation coroutine to completion.
if __name__ == "__main__":
    evaluation_coro = run_evaluation()
    asyncio.run(evaluation_coro)