import json

import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader

from rag_graph import RagGraph
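
# NOTE: RagGraph is the local pipeline under test. From its use in evaluate_rag()
# below, its assumed interface is roughly:
#
#     rag_graph = RagGraph(qdrant_client, use_finetuned_embeddings=<bool>)
#     result = rag_graph.run(user_input)
#     # -> {"response": str, "context": [Document, ...]} with LangChain-style
#     #    Documents exposing .page_content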

# Load environment variables
load_dotenv()

USE_FINE_TUNED_EMBEDDINGS = True

# Create the output folder; fine-tuned runs are kept in a separate subfolder
eval_data_filepath = (
    Path("data/evaluation/finetuned")
    if USE_FINE_TUNED_EMBEDDINGS
    else Path("data/evaluation")
)
eval_data_filepath.mkdir(parents=True, exist_ok=True)
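
# Expected data layout (inferred from the paths used in this script):
#   data/scraped/clean/*.txt      cleaned source documents
#   data/vectors/                 local Qdrant store, built elsewhere
#   data/evaluation[/finetuned]/  artifacts written by this script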

# DirectoryLoader parses the .txt files via unstructured, which needs these NLTK models
import nltk

nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")

# LLMs and embeddings for test-set generation and evaluation judging
eval_gen_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_gen_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
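
# Note: the same gpt-4o model serves as both test-set generator and judge here;
# that is convenient, but LLM judges can favor outputs from similar models, so
# using a different judge model is a common variation.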


def generate_test_data():
    """Generate test data for evaluation."""
    # Load documents
    loader = DirectoryLoader("data/scraped/clean", glob="*.txt")
    docs = loader.load()

    # Generate test set data
    generator = TestsetGenerator(
        llm=eval_gen_llm,
        embedding_model=eval_gen_embeddings,
    )
    print("Generating test set data...")
    dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

    # Save test set data to file
    dataset.to_pandas().to_json(
        eval_data_filepath / "testset_data.json", orient="records", indent=4
    )

    # Upload test set data to RAGAS
    dataset.upload()
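
# Note: dataset.upload() pushes the test set to the ragas app (app.ragas.io), which,
# assuming the standard ragas app flow, requires RAGAS_APP_TOKEN in the environment;
# drop the call if the local JSON copy is enough.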


def evaluate_rag():
    """Generate RAG responses for the test set and score them with RAGAS metrics."""
    # Create Qdrant client and RAG graph
    qdrant_client = QdrantClient(path="data/vectors")
    rag_graph = RagGraph(qdrant_client, use_finetuned_embeddings=USE_FINE_TUNED_EMBEDDINGS)

    # Load the test set from testset_data.json
    dataset_df = pd.read_json(eval_data_filepath / "testset_data.json")
    dataset = EvaluationDataset.from_pandas(dataset_df)

    # Run the RAG graph for each test question, filling in the response and
    # retrieved contexts on each sample in place
    print("Generating responses...")
    for test_row in dataset:
        user_input = test_row.user_input
        print(f"Generating response for: {user_input}")
        response = rag_graph.run(user_input)
        test_row.response = response["response"]
        test_row.retrieved_contexts = [context.page_content for context in response["context"]]

    # Round-trip through pandas to get a clean EvaluationDataset with the new columns
    evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

    # Save evaluation data to file
    evaluation_dataset.to_pandas().to_json(
        eval_data_filepath / "evaluation_data.json", orient="records", indent=4
    )

    # Evaluate the responses
    print("Evaluating responses...")
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[
            LLMContextRecall(),
            Faithfulness(),
            FactualCorrectness(),
            ResponseRelevancy(),
            ContextEntityRecall(),
            NoiseSensitivity(),
        ],
        llm=eval_judge_llm,
        run_config=RunConfig(timeout=360),
    )

    # Write evaluation results to file
    print("Writing evaluation results to file...")
    (eval_data_filepath / "evaluation_results.json").write_text(
        result.to_pandas().to_json(orient="records", indent=4)
    )
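
# Rough metric map (RAGAS): LLMContextRecall and ContextEntityRecall score retrieval
# coverage against the reference, Faithfulness and NoiseSensitivity score how well the
# answer sticks to (and resists noise in) the retrieved context, FactualCorrectness
# compares the answer with the reference, and ResponseRelevancy checks that the answer
# actually addresses the question.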


def calculate_average_evaluation_results():
    """Average the per-sample scores in evaluation_results.json."""
    evaluation_results = json.loads(
        (eval_data_filepath / "evaluation_results.json").read_text()
    )
    # Column names as written by ragas (ResponseRelevancy reports as answer_relevancy)
    fields = [
        "context_recall",
        "faithfulness",
        "factual_correctness",
        "answer_relevancy",
        "context_entity_recall",
        "noise_sensitivity_relevant",
    ]

    # Keep only the metric columns, then average each metric across all samples
    evaluation_results_df = pd.DataFrame(
        [{field: row[field] for field in fields} for row in evaluation_results]
    )
    average_results = evaluation_results_df.mean()

    # Save the averages as a single object keyed by metric name
    (eval_data_filepath / "evaluation_results_average.json").write_text(
        average_results.to_json(orient="index", indent=4)
    )
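
# The averages file is a single JSON object keyed by metric, i.e. of the shape:
#     {"context_recall": <float>, "faithfulness": <float>, ...}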


# Run the evaluation from the CLI
def main():
    """Run the evaluation pipeline."""
    # generate_test_data()  # run once first to create testset_data.json
    evaluate_rag()
    calculate_average_evaluation_results()


if __name__ == "__main__":
    main()
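
# Usage sketch (file name assumed; adjust to this script's actual name):
#     python evaluate.py
# On a fresh checkout, uncomment generate_test_data() in main() once to build
# testset_data.json before running the evaluation.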