# eval_process_dataset.py
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core.evaluation import (
    BatchEvalRunner,
    EmbeddingQAFinetuneDataset,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    RetrieverEvaluator,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from utils import load_db  # Assuming utils.py is in the same directory
import asyncio
import nest_asyncio
# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
    timeout=60,  # Increase the request timeout to 60 seconds
    max_retries=5,  # Retry failed embedding calls up to 5 times
    # Note: OpenAIEmbedding accepts no retry_delay argument; backoff between
    # retries is handled by the underlying OpenAI client.
)
# Create a debug handler to see more info about API calls
# debug_handler = LlamaDebugHandler(print_trace_on_end=True)
# callback_manager = CallbackManager([debug_handler])
# Configure global settings
# Settings.callback_manager = callback_manager
# Note: llama_index's global Settings has no retry_policy option; assigning one
# is a no-op. Retries and backoff are configured per client instead (see the
# max_retries and timeout arguments on OpenAIEmbedding above).
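
# Hedged sketch (not part of the original script): the rag_eval_dataset.json
# file loaded in run_evaluation() below can be produced with llama_index's
# generate_question_context_pairs helper. The `nodes` argument is an assumption
# here -- it would be the source nodes the index was built from.
def build_eval_dataset(nodes, llm, path="./rag_eval_dataset.json"):
    from llama_index.core.evaluation import generate_question_context_pairs

    # Ask the LLM to write questions for each chunk, pairing each query with
    # the node IDs that should be retrieved for it
    dataset = generate_question_context_pairs(nodes, llm=llm, num_questions_per_chunk=2)
    dataset.save_json(path)
    return dataset
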
async def run_evaluation():
    load_dotenv()
    index = load_db()
    rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset.json")

    # Define the judge LLM (gpt-4o) and the answering LLM (gpt-4o-mini)
    llm_gpt4o = OpenAI(temperature=0, model="gpt-4o")
    llm_gpt4o_mini = OpenAI(temperature=0, model="gpt-4o-mini")

    # Instantiate the faithfulness and relevancy evaluator objects
    faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4o)
    relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4o)

    # Extract the questions from the dataset
    queries = list(rag_eval_dataset.queries.values())
    # Limit to the first 5 questions to save time (!!remove this line in production!!)
    batch_eval_queries = queries[:5]
    # The batch evaluator runs the evaluations concurrently
    runner = BatchEvalRunner(
        {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
        workers=8,
    )

    # Try different `similarity_top_k` values
    for i in [2, 4, 6]:
        # Build a query engine that returns `i` chunks per query
        query_engine = index.as_query_engine(similarity_top_k=i, llm=llm_gpt4o_mini)
        # Run the evaluation
        eval_results = await runner.aevaluate_queries(query_engine, queries=batch_eval_queries)
        # Print the fraction of passing results for each evaluator
        faithfulness_score = sum(
            result.passing for result in eval_results["faithfulness"]
        ) / len(eval_results["faithfulness"])
        print(f"top_{i} faithfulness_score: {faithfulness_score}")
        relevancy_score = sum(result.passing for result in eval_results["relevancy"]) / len(
            eval_results["relevancy"]
        )
        print(f"top_{i} relevancy_score: {relevancy_score}")
        print("=" * 15)
# Run the async function
if __name__ == "__main__":
    asyncio.run(run_evaluation())