import uuid
from typing import Dict, Any, Optional
from langsmith import Client
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from dotenv import load_dotenv
load_dotenv()
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
class LangSmithEvaluator:
    """Handles evaluation using LangSmith"""

    def __init__(self, api_key: str = LANGSMITH_API_KEY):
        self.client = Client(api_key=api_key)
        self.evaluator_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=GEMINI_API_KEY)
    def evaluate_response(self, query: str, response: str, reference: Optional[str] = None) -> Dict[str, Any]:
        """Evaluate an LLM response against a query and optional reference"""
        # Criteria evaluators grade the response with the evaluator LLM;
        # "correctness" and "embedding_distance" compare the prediction to a
        # reference label, so they are only enabled when one is supplied.
        evaluators = [
            RunEvalConfig.Criteria("helpfulness"),
            RunEvalConfig.Criteria("relevance"),
        ]
        if reference is not None:
            evaluators.append(RunEvalConfig.LabeledCriteria("correctness"))
            evaluators.append("embedding_distance")
        eval_config = RunEvalConfig(
            evaluators=evaluators,
            eval_llm=self.evaluator_llm,
        )
        try:
            # Create a throwaway dataset with a single example; the name is
            # suffixed so repeated calls don't collide on a duplicate name.
            dataset = self.client.create_dataset(
                f"evaluation_dataset_{uuid.uuid4().hex[:8]}",
                description="Dataset for evaluation of LLM responses"
            )
            # The example's outputs hold the reference label (if any); the
            # model's answer is supplied by the target function at run time.
            self.client.create_example(
                inputs={"question": query},
                outputs={"answer": reference} if reference else None,
                dataset_id=dataset.id
            )
            # Run evaluation; the target function replays the precomputed
            # response for the single example in the dataset.
            evaluation_results = run_on_dataset(
                client=self.client,
                dataset_name=dataset.name,
                llm_or_chain_factory=lambda inputs: {"answer": response},
                evaluation=eval_config
            )
            return evaluation_results
        except Exception as e:
            print(f"Error during evaluation: {str(e)}")
            return {"error": str(e)}