Spaces:
Sleeping
Sleeping
import os
import uuid
from typing import Any, Dict, Optional

from dotenv import load_dotenv
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_google_genai import ChatGoogleGenerativeAI
from langsmith import Client
from langsmith.evaluation import run_evaluator
# Merge variables from a .env file (if any) into the environment before reading them.
load_dotenv()

# Service credentials; each is None when the variable is not set.
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
class LangSmithEvaluator:
    """Grade LLM responses through LangSmith, using a Gemini model as judge.

    Instance attributes (set in ``__init__``):
        client: LangSmith ``Client`` used to create datasets and examples.
        evaluator_llm: ``ChatGoogleGenerativeAI`` chat model handed to the
            criteria evaluators as the grading LLM.
    """

    def __init__(self, api_key: str = LANGSMITH_API_KEY):
        """Create the LangSmith client and the Gemini grading model.

        Args:
            api_key: LangSmith API key. Defaults to the module-level
                ``LANGSMITH_API_KEY`` captured when the class is defined.
        """
        self.client = Client(api_key=api_key)
        self.evaluator_llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=GEMINI_API_KEY,
        )

    def _build_eval_config(self, has_reference: bool) -> RunEvalConfig:
        """Assemble the evaluator configuration for one evaluation run.

        Args:
            has_reference: Whether the dataset example carries a reference
                answer. Reference-based graders are enabled only when True.

        Returns:
            A ``RunEvalConfig`` whose evaluators all grade with
            ``self.evaluator_llm``.
        """
        # Per-criterion graders are evaluator configs, so they belong in
        # `evaluators`; `custom_evaluators` is for evaluator objects/callables.
        # The bare "criteria" evaluator is omitted: with no criteria configured
        # it grades helpfulness, which is already listed explicitly here.
        evaluators = [
            RunEvalConfig.Criteria("helpfulness", llm=self.evaluator_llm),
            RunEvalConfig.Criteria("relevance", llm=self.evaluator_llm),
        ]
        if has_reference:
            # Grade correctness against the stored reference answer.
            evaluators.append(
                RunEvalConfig.LabeledCriteria("correctness", llm=self.evaluator_llm)
            )
            # NOTE(review): embedding distance uses LangChain's default
            # embeddings model unless one is configured -- confirm that the
            # matching credentials are available in this deployment.
            evaluators.append("embedding_distance")
        else:
            # Without a reference, fall back to reference-free grading.
            evaluators.append(
                RunEvalConfig.Criteria("correctness", llm=self.evaluator_llm)
            )
        # eval_llm makes string-specified evaluators use the same judge model.
        return RunEvalConfig(evaluators=evaluators, eval_llm=self.evaluator_llm)

    def evaluate_response(
        self, query: str, response: str, reference: Optional[str] = None
    ) -> Dict[str, Any]:
        """Evaluate an LLM response against a query and optional reference.

        A single-example dataset is created in LangSmith, the supplied
        ``response`` is replayed as the prediction for that example, and the
        configured evaluators grade it.

        Args:
            query: The question that was put to the LLM.
            response: The LLM answer to be graded.
            reference: Optional ground-truth answer. When provided it is
                stored as the example's expected output and reference-based
                evaluators are enabled.

        Returns:
            The value returned by ``run_on_dataset`` on success (presumably a
            mapping with the project name and per-example results -- verify
            against the installed LangChain version), or ``{"error": <message>}``
            if any step fails.
        """

        def _replay_response(_inputs: Dict[str, Any]) -> Dict[str, str]:
            # The answer already exists; return it instead of calling a model.
            return {"answer": response}

        try:
            eval_config = self._build_eval_config(has_reference=reference is not None)

            # Dataset names must be unique, so a fixed name would make every
            # call after the first fail; suffix with a random identifier.
            # The dataset is intentionally not deleted afterwards so the
            # evaluation results remain attached to it in LangSmith.
            dataset = self.client.create_dataset(
                f"evaluation_dataset_{uuid.uuid4().hex}",
                description="Dataset for evaluation of LLM responses",
            )

            # The example's outputs act as the reference answer, so store the
            # reference there (never the response under test).
            self.client.create_example(
                inputs={"question": query},
                outputs={"answer": reference} if reference is not None else None,
                dataset_id=dataset.id,
            )

            return run_on_dataset(
                client=self.client,
                dataset_name=dataset.name,
                llm_or_chain_factory=_replay_response,
                evaluation=eval_config,
            )
        except Exception as e:
            # Best-effort contract: report the failure to the caller as data.
            print(f"Error during evaluation: {str(e)}")
            return {"error": str(e)}