Spaces:

AmritSbisht
/

Doc-Weather-Bot

Sleeping

App Files Files Community

Doc-Weather-Bot / utils /evaluation.py

AmritSbisht

Upload 25 files

f974658 verified 9 months ago

raw

history blame contribute delete

2.38 kB

	from typing import Dict, Any
	from langsmith import Client
	from langchain.smith import RunEvalConfig
	from langsmith.evaluation import run_evaluator
	from langchain_google_genai import ChatGoogleGenerativeAI
	import os
	from dotenv import load_dotenv
	load_dotenv()

	# Service credentials are read from the process environment; load_dotenv()
	# has already run earlier in this module. Either value is None when the
	# corresponding variable is not set.
	LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

	class LangSmithEvaluator:
	    """Grade a single LLM response with LangChain evaluators run through LangSmith.

	    Instance attributes (set in ``__init__``):
	        client: LangSmith ``Client`` used to create the dataset/example and
	            to launch the evaluation run.
	        evaluator_llm: Gemini chat model passed to the criteria evaluators
	            as the grading model.
	    """

	    def __init__(self, api_key: str = LANGSMITH_API_KEY):
	        """Create the LangSmith client and the Gemini grading model.

	        Args:
	            api_key: LangSmith API key. Defaults to the module-level
	                ``LANGSMITH_API_KEY`` (captured when the class is defined).
	        """
	        self.client = Client(api_key=api_key)
	        self.evaluator_llm = ChatGoogleGenerativeAI(
	            model="gemini-2.0-flash",
	            google_api_key=GEMINI_API_KEY,
	        )

	    def _build_eval_config(self, with_reference: bool) -> RunEvalConfig:
	        """Assemble the evaluator set, adding reference-based ones only when usable."""
	        evaluators = [
	            RunEvalConfig.Criteria(criteria="helpfulness", llm=self.evaluator_llm),
	            RunEvalConfig.Criteria(criteria="relevance", llm=self.evaluator_llm),
	        ]
	        if with_reference:
	            # Both of these compare the prediction with a ground-truth answer,
	            # so they are only meaningful when a reference was supplied.
	            evaluators.append(
	                RunEvalConfig.LabeledCriteria(
	                    criteria="correctness",
	                    llm=self.evaluator_llm,
	                )
	            )
	            # NOTE(review): "embedding_distance" uses LangChain's default
	            # embedding model, which presumably needs its own credentials
	            # (not the Gemini key) - confirm before relying on it.
	            evaluators.append("embedding_distance")
	        return RunEvalConfig(evaluators=evaluators)

	    def evaluate_response(self, query: str, response: str, reference: str = None) -> Dict[str, Any]:
	        """Evaluate an LLM response against a query and optional reference.

	        A fresh single-example dataset is created in LangSmith, the existing
	        ``response`` is replayed as the prediction for that example, and the
	        configured evaluators are run over it.

	        Args:
	            query: The user question the response answers.
	            response: The already-generated answer to grade.
	            reference: Optional ground-truth answer. When given it is stored
	                as the example's expected output and enables the
	                ``correctness`` and ``embedding_distance`` evaluators.

	        Returns:
	            Whatever ``Client.run_on_dataset`` returns on success, or
	            ``{"error": <message>}`` if any step raises.
	        """
	        import uuid

	        try:
	            eval_config = self._build_eval_config(reference is not None)

	            # A unique name per call: creating a dataset under a name that
	            # already exists fails, and sharing one dataset would re-grade
	            # older examples against the current response.
	            dataset_name = f"evaluation_dataset-{uuid.uuid4().hex[:8]}"
	            dataset = self.client.create_dataset(
	                dataset_name,
	                description="Dataset for evaluation of LLM responses",
	            )

	            # Dataset outputs hold the expected answer (the reference), not
	            # the response under test.
	            expected = {"answer": reference} if reference is not None else None
	            self.client.create_example(
	                inputs={"question": query},
	                outputs=expected,
	                dataset_id=dataset.id,
	            )

	            # The answer already exists, so the "model under test" simply
	            # replays it for the stored question.
	            # NOTE(review): run_on_dataset is the legacy LangChain-backed
	            # runner and is marked deprecated in recent langsmith releases -
	            # confirm it is available in the pinned version.
	            return self.client.run_on_dataset(
	                dataset_name=dataset_name,
	                llm_or_chain_factory=lambda _inputs: response,
	                evaluation=eval_config,
	            )
	        except Exception as e:
	            # Best-effort API: report the failure to the caller as data
	            # instead of raising.
	            print(f"Error during evaluation: {str(e)}")
	            return {"error": str(e)}