File size: 2,379 Bytes
f974658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import uuid
from typing import Any, Dict, Optional

from dotenv import load_dotenv
from langchain.smith import RunEvalConfig
from langchain_google_genai import ChatGoogleGenerativeAI
from langsmith import Client
from langsmith.evaluation import run_evaluator
load_dotenv()

# API credentials, looked up once at import time (after load_dotenv() has run).
# Each value is None when the corresponding environment variable is not set.
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

class LangSmithEvaluator:
    """Grade a single LLM response with LangSmith and LangChain evaluators."""

    def __init__(self, api_key: Optional[str] = LANGSMITH_API_KEY):
        """Create the LangSmith client and the Gemini model used as the judge.

        Args:
            api_key: LangSmith API key handed to ``Client``. Defaults to the
                module-level ``LANGSMITH_API_KEY``.
        """
        self.client = Client(api_key=api_key)
        self.evaluator_llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=GEMINI_API_KEY,
        )

    def evaluate_response(
        self, query: str, response: str, reference: Optional[str] = None
    ) -> Dict[str, Any]:
        """Evaluate an LLM response against a query and optional reference.

        The query is uploaded as a one-example LangSmith dataset, the
        already-generated ``response`` is replayed as the output of the
        "model" under test, and the configured evaluators grade it.

        Args:
            query: The question that was put to the LLM.
            response: The answer the LLM produced; this is what gets graded.
            reference: Optional ground-truth answer. When given it is stored
                as the dataset example's expected output and the
                reference-based evaluators are enabled; when omitted only
                reference-free criteria are used.

        Returns:
            The mapping returned by ``run_on_dataset`` on success, or
            ``{"error": "<message>"}`` if any step fails.
        """
        # Imported here because this method is the module's only user of it.
        # NOTE(review): newer LangChain releases deprecate this runner in
        # favour of LangSmith's own evaluate API; confirm against the
        # installed version.
        from langchain.smith import run_on_dataset

        judge = self.evaluator_llm

        def _replay(_inputs: Dict[str, Any]) -> Dict[str, str]:
            # The response already exists, so the "model" being evaluated
            # simply hands it back for every dataset example.
            return {"answer": response}

        try:
            # String entries use LangChain's defaults for that evaluator.
            # NOTE(review): those defaults may require their own provider
            # credentials; confirm for the deployment environment.
            evaluators = [
                "criteria",
                RunEvalConfig.Criteria("helpfulness", llm=judge),
                RunEvalConfig.Criteria("relevance", llm=judge),
            ]
            if reference is not None:
                # These evaluators compare the prediction with the reference.
                evaluators.append("embedding_distance")
                evaluators.append(
                    RunEvalConfig.LabeledCriteria("correctness", llm=judge)
                )
            else:
                evaluators.append(
                    RunEvalConfig.Criteria("correctness", llm=judge)
                )
            eval_config = RunEvalConfig(evaluators=evaluators)

            # A fresh name per call: reusing one fixed name would collide with
            # the dataset from an earlier call, and sharing a dataset would
            # make every stored example be graded against this response.
            dataset_name = f"evaluation_dataset_{uuid.uuid4().hex}"
            dataset = self.client.create_dataset(
                dataset_name,
                description="Dataset for evaluation of LLM responses",
            )

            # The example's outputs are the expected answer, so only the
            # reference (never the response under test) belongs there.
            expected = {"answer": reference} if reference is not None else None
            self.client.create_example(
                inputs={"question": query},
                outputs=expected,
                dataset_id=dataset.id,
            )

            return run_on_dataset(
                client=self.client,
                dataset_name=dataset_name,
                llm_or_chain_factory=_replay,
                evaluation=eval_config,
            )
        except Exception as e:
            # Best-effort API: report the failure to the caller instead of
            # raising, as the original error path did.
            print(f"Error during evaluation: {str(e)}")
            return {"error": str(e)}