# round_robin_evaluator.py
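"""Round-robin evaluation for the LLM comparison hub: each model's response to a
prompt is scored by the other two models on eight criteria, per-metric averages
are computed, and the full results are written to a timestamped CSV file."""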
import csv
import json
import os
import re
from datetime import datetime

import anthropic
import google.generativeai as genai
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables
load_dotenv()
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
anthropic_client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
def safe_parse_json(text):
    """Extract and parse the first JSON object from possibly noisy LLM output."""
    try:
        # Greedy match from the first '{' to the last '}' strips any prose or
        # markdown fences the model wrapped around the JSON object.
        match = re.search(r'\{.*\}', text, re.DOTALL)
        if match:
            return json.loads(match.group())
    except Exception as e:
        print(f"[Safe JSON Parse Error] {e}")
    return None
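# Illustrative behaviour (hypothetical inputs, not real model output):
#   safe_parse_json('Scores below:\n{"helpfulness": 0.9, "clarity": 0.8}')
#   -> {'helpfulness': 0.9, 'clarity': 0.8}
#   safe_parse_json("no JSON here") -> None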
def evaluate_response(evaluator_model, prompt, target_model, response_text):
"""Evaluate a response using the specified evaluator model."""
evaluation_prompt = (
f"You are an AI tasked with evaluating another model's response.\n"
f"Here is the original prompt: \"{prompt}\"\n"
f"Here is the response from {target_model}: \"{response_text}\"\n\n"
f"Evaluate this response on the following criteria from 0 (worst) to 1 (best):\n"
f"- Helpfulness\n- Correctness\n- Coherence\n- Tone\n- Accuracy\n"
f"- Relevance\n- Completeness\n- Clarity\n\n"
f"Return ONLY a valid JSON object with the following keys:\n"
f"{{\n"
f" \"helpfulness\": <float>,\n"
f" \"correctness\": <float>,\n"
f" \"coherence\": <float>,\n"
f" \"tone_score\": <float>,\n"
f" \"accuracy\": <float>,\n"
f" \"relevance\": <float>,\n"
f" \"completeness\": <float>,\n"
f" \"clarity\": <float>,\n"
f" \"reasoning\": \"explanation\",\n"
f" \"notes\": \"additional remarks\"\n"
f"}}"
)
try:
if evaluator_model == "GPT-4":
response = openai_client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": evaluation_prompt}],
temperature=0.3
)
result = response.choices[0].message.content
elif evaluator_model == "Claude 3":
response = anthropic_client.messages.create(
model="claude-3-opus-20240229",
max_tokens=1000,
temperature=0.3,
messages=[{"role": "user", "content": evaluation_prompt}]
)
result = response.content[0].text
elif evaluator_model == "Gemini 1.5":
model = genai.GenerativeModel("gemini-1.5-pro")
eval_response = model.generate_content(evaluation_prompt)
result = eval_response.text
else:
print(f"Unknown evaluator model: {evaluator_model}")
return None
        # safe_parse_json returns None on failure, so compare against None
        # rather than truthiness (an empty-but-valid object should pass through).
        parsed = safe_parse_json(result)
        if parsed is None:
            print(f"Failed to parse JSON from {evaluator_model} evaluation")
        return parsed
except Exception as e:
print(f"Error in {evaluator_model} evaluation: {str(e)}")
return None
def comprehensive_round_robin_evaluation(responses_dict, prompt):
    """Have each model's response scored by the other two models and average the scores."""
    print("\nStarting comprehensive round-robin evaluation...")
evaluation_matrix = {
"GPT-4": ["Claude 3", "Gemini 1.5"],
"Claude 3": ["GPT-4", "Gemini 1.5"],
"Gemini 1.5": ["GPT-4", "Claude 3"]
}
comprehensive_results = {}
for target_model, response_text in responses_dict.items():
print(f"\nCollecting evaluations for {target_model}...")
comprehensive_results[target_model] = {
'response': response_text,
'evaluations': {},
'average_scores': {}
}
        # Each model is judged only by its peers; models missing from the
        # matrix are skipped instead of raising a KeyError.
        for evaluator in evaluation_matrix.get(target_model, []):
print(f" {evaluator} evaluating {target_model}...")
evaluation = evaluate_response(evaluator, prompt, target_model, response_text)
if evaluation:
comprehensive_results[target_model]['evaluations'][evaluator] = evaluation
print(f" {evaluator} evaluation completed")
else:
print(f" {evaluator} evaluation failed")
        if comprehensive_results[target_model]['evaluations']:
            metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
                       'accuracy', 'relevance', 'completeness', 'clarity']
            for metric in metrics:
                scores = [
                    eval_data[metric]
                    for eval_data in comprehensive_results[target_model]['evaluations'].values()
                    if isinstance(eval_data.get(metric), (int, float))
                ]
                # Fall back to a neutral 0.5 when no evaluator returned a usable score.
                comprehensive_results[target_model]['average_scores'][metric] = (
                    round(sum(scores) / len(scores), 3) if scores else 0.5
                )
print(f"\nComprehensive evaluation completed for {len(comprehensive_results)} models")
return comprehensive_results
def save_comprehensive_results(comprehensive_results, prompt, timestamp=None):
    """Flatten the round-robin results into rows and write them to a timestamped CSV."""
if timestamp is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"results/comprehensive_eval_{timestamp}.csv"
os.makedirs("results", exist_ok=True)
rows = []
for model, data in comprehensive_results.items():
avg_scores = data.get('average_scores', {})
for evaluator, evaluation in data.get('evaluations', {}).items():
row = {
'timestamp': timestamp,
'prompt': prompt,
'target_model': model,
'evaluator': evaluator,
'response': data['response'],
'helpfulness': evaluation.get('helpfulness', 0.5),
'correctness': evaluation.get('correctness', 0.5),
'coherence': evaluation.get('coherence', 0.5),
'tone_score': evaluation.get('tone_score', 0.5),
'accuracy': evaluation.get('accuracy', 0.5),
'relevance': evaluation.get('relevance', 0.5),
'completeness': evaluation.get('completeness', 0.5),
'clarity': evaluation.get('clarity', 0.5),
'reasoning': evaluation.get('reasoning', ''),
'notes': evaluation.get('notes', ''),
'avg_helpfulness': avg_scores.get('helpfulness', 0.5),
'avg_correctness': avg_scores.get('correctness', 0.5),
'avg_coherence': avg_scores.get('coherence', 0.5),
'avg_tone_score': avg_scores.get('tone_score', 0.5),
'avg_accuracy': avg_scores.get('accuracy', 0.5),
'avg_relevance': avg_scores.get('relevance', 0.5),
'avg_completeness': avg_scores.get('completeness', 0.5),
'avg_clarity': avg_scores.get('clarity', 0.5)
}
rows.append(row)
if rows:
fieldnames = list(rows[0].keys())
with open(filename, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
print(f"Results saved to {filename}")
return filename
else:
print("No results to save")
return None
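if __name__ == "__main__":
    # Minimal usage sketch: requires valid OPENAI_API_KEY, CLAUDE_API_KEY, and
    # GEMINI_API_KEY in the environment. The prompt and responses below are
    # hypothetical placeholders; in practice the responses would come from
    # actual calls to each model.
    demo_prompt = "Explain the difference between supervised and unsupervised learning."
    demo_responses = {
        "GPT-4": "Supervised learning trains on labeled examples...",
        "Claude 3": "In supervised learning, a model learns from labeled data...",
        "Gemini 1.5": "Supervised learning relies on labeled datasets...",
    }
    results = comprehensive_round_robin_evaluation(demo_responses, demo_prompt)
    save_comprehensive_results(results, demo_prompt)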