"""Evaluate the bug-search RAG pipeline with RAGAS.

For each query in eval_dataset.json: retrieve bug contexts, generate an
answer with GPT-4o, then score the (query, contexts, answer, reference)
samples with Faithfulness, Answer Relevancy, and Context Precision.
Aggregate scores are printed and written to results.json.
"""

import argparse
import json
import os
import sys

from dotenv import load_dotenv

load_dotenv()

# Make the project root importable so `rag` resolves when run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from rag import search_bugs, init_store
from openai import OpenAI
from ragas import evaluate, EvaluationDataset, SingleTurnSample
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision
from ragas.llms import LlamaIndexLLMWrapper
from llama_index.llms.openai import OpenAI as LlamaOpenAI

# Anchor data paths to this file's directory so the script works from any CWD.
_HERE = os.path.dirname(os.path.abspath(__file__))
DATASET = os.path.join(_HERE, "eval_dataset.json")
RESULTS = os.path.join(_HERE, "results.json")


def get_answer(query, contexts, api_key):
    """Generate a short (<=2 sentence) answer to *query* grounded in *contexts*.

    Args:
        query: The user question.
        contexts: List of retrieved context strings.
        api_key: OpenAI API key.

    Returns:
        The stripped answer text from GPT-4o.
    """
    client = OpenAI(api_key=api_key)
    context = "\n".join(contexts)
    resp = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=150,
        messages=[{
            "role": "user",
            "content": f"Query: {query}\n\nContext:\n{context}\n\nAnswer in 2 sentences:"
        }]
    )
    return resp.choices[0].message.content.strip()


def build_sample(item, api_key):
    """Build one RAGAS sample: retrieve contexts, generate an answer.

    Args:
        item: Dataset entry with "query" and "reference_answer" keys.
        api_key: OpenAI API key.

    Returns:
        A SingleTurnSample ready for RAGAS evaluation.
    """
    query = item["query"]
    bugs = search_bugs(query, top_k=5)
    contexts = [f"{b['title']}: {b['description']}" for b in bugs]
    answer = get_answer(query, contexts, api_key)
    print(f"query : {query}")
    print(f"answer: {answer}\n")
    return SingleTurnSample(
        user_input=query,
        response=answer,
        retrieved_contexts=contexts,
        reference=item["reference_answer"]
    )


def run_eval(api_key):
    """Run the full evaluation and write aggregate scores to RESULTS.

    Args:
        api_key: OpenAI API key used for both generation and judging.
    """
    # `with` closes the file promptly (the bare open() leaked the handle).
    with open(DATASET) as f:
        dataset = json.load(f)
    print(f"Loaded {len(dataset)} queries\n")
    init_store()

    # GPT-4o also acts as the judge LLM for all three RAGAS metrics.
    llm = LlamaOpenAI(model="gpt-4o", api_key=api_key)
    evaluator_llm = LlamaIndexLLMWrapper(llm)

    samples = [build_sample(item, api_key) for item in dataset]
    results = evaluate(
        EvaluationDataset(samples=samples),
        metrics=[
            Faithfulness(llm=evaluator_llm),
            AnswerRelevancy(llm=evaluator_llm),
            ContextPrecision(llm=evaluator_llm),
        ]
    )

    df = results.to_pandas()
    print("=" * 40)
    print("RAGAS RESULTS")
    print("=" * 40)
    print(f"Faithfulness : {df['faithfulness'].mean():.3f}")
    print(f"Answer Relevancy : {df['answer_relevancy'].mean():.3f}")
    print(f"Context Precision : {df['context_precision'].mean():.3f}")
    print("=" * 40)

    summary = {
        "faithfulness": round(float(df["faithfulness"].mean()), 3),
        "answer_relevancy": round(float(df["answer_relevancy"].mean()), 3),
        "context_precision": round(float(df["context_precision"].mean()), 3),
    }
    with open(RESULTS, "w") as f:
        json.dump(summary, f, indent=2)
    # Report the actual output path instead of a hardcoded "eval/results.json".
    print(f"Saved to {RESULTS}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"))
    args = parser.parse_args()
    if not args.api_key:
        print("Error: OPENAI_API_KEY not found. Set it in .env or pass --api-key")
        sys.exit(1)
    run_eval(args.api_key)