# evaluation-framework / evaluator.py
# NOTE: Hugging Face page-header residue (uploader "manayporwal07",
# commit ccf526b, "raw / history / blame", 23.5 kB) removed — those
# lines were not Python and broke the file.
#####################################################################################################################################################################
# import re
# import json
# import torch
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
# import uuid
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from sentence_transformers import SentenceTransformer, util
# import matplotlib.pyplot as plt
# import numpy as np
# def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
# """
# Radar chart comparing multiple agents across metrics.
# """
# labels = metrics
# num_vars = len(labels)
# # Compute angle for each axis
# angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
# angles += angles[:1] # close loop
# fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
# for agent in agents:
# values = []
# for m in metrics:
# mean_val = metrics_df.loc[metrics_df['agent'] == agent, m].mean()
# values.append(mean_val if not np.isnan(mean_val) else 0)
# values += values[:1]
# ax.plot(angles, values, label=agent, linewidth=2)
# ax.fill(angles, values, alpha=0.25)
# ax.set_xticks(angles[:-1])
# ax.set_xticklabels(labels)
# ax.set_yticklabels([])
# ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
# ax.set_title("Agent Performance Radar Chart")
# plt.tight_layout()
# plt.savefig(out_path)
# plt.close()
# return out_path
# import seaborn as sns
# def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
# pivot = metrics_df.groupby("agent")[
# ["accuracy", "hallucination", "instruction_following", "coherence", "assumption"]
# ].mean()
# plt.figure(figsize=(8, 5))
# sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".2f")
# plt.title("Agent Γ— Metric Heatmap")
# plt.tight_layout()
# plt.savefig(out_path)
# plt.close()
# return out_path
# # --------------------------
# # MODEL LOADING
# # --------------------------
# NLI_MODEL = "textattack/roberta-base-MNLI"
# EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# # Load NLI model & tokenizer
# nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
# nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
# nli_model.to("cpu")
# nli_model.eval()
# # Load embedding model
# embed_model = SentenceTransformer(EMBED_MODEL)
# # Label mapping from config
# id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
# # --------------------------
# # METRIC FUNCTIONS
# # --------------------------
# def check_instruction_following(prompt: str, response: str) -> float:
# """Embedding-based similarity between prompt and response."""
# if not prompt or not response:
# return 0.0
# p_emb = embed_model.encode(prompt, convert_to_tensor=True)
# r_emb = embed_model.encode(response, convert_to_tensor=True)
# sim = float(util.cos_sim(p_emb, r_emb).item())
# return round(max(0.0, min(1.0, sim)), 3)
# def check_hallucination(reference: str, response: str) -> float:
# """
# Single hallucination score:
# Entailment prob - Contradiction prob (normalized to [0,1]).
# Higher = less hallucination.
# """
# if not reference or not response:
# return 0.0
# with torch.no_grad():
# inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
# outputs = nli_model(**inputs)
# probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
# entail_prob, contra_prob = 0.0, 0.0
# for idx, p in enumerate(probs):
# label = id2label.get(idx, "")
# if "ENTAIL" in label:
# entail_prob = float(p)
# elif "CONTRA" in label:
# contra_prob = float(p)
# score = entail_prob - contra_prob
# score = (score + 1) / 2 # normalize [-1,1] β†’ [0,1]
# return round(max(0.0, min(1.0, score)), 3)
# def check_assumption(response: str) -> float:
# """Detect speculative/hedging terms."""
# if not response:
# return 0.0
# speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
# count = sum(1 for t in speculative_terms if t in response.lower())
# score = 1.0 - min(count / 5.0, 1.0) # smoother decay
# return round(score, 3)
# def check_coherence(response: str) -> float:
# """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
# if not response:
# return 0.0
# words = len(re.findall(r"\w+", response))
# sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
# if words < 5:
# return 0.3
# if words > 200:
# return 0.5
# base = min(1.0, (words / 50.0) + (sents / 5.0))
# return round(max(0.4, min(base, 0.95)), 3)
# def check_accuracy(reference: str, response: str) -> float:
# """Semantic similarity between reference and response via embeddings (cosine)."""
# if not reference or not response:
# return 0.0
# ref_emb = embed_model.encode(reference, convert_to_tensor=True)
# resp_emb = embed_model.encode(response, convert_to_tensor=True)
# sim = float(util.cos_sim(ref_emb, resp_emb).item())
# return round(max(0.0, min(1.0, sim)), 3)
# # --------------------------
# # ROW & DF EVALUATION
# # --------------------------
# def evaluate_row(row):
# prompt = row.get("prompt", "")
# response = row.get("response", "")
# reference = row.get("reference", "")
# metrics = {
# "task_id": row.get("task_id", ""),
# "agent": row.get("agent", ""),
# "instruction_following": check_instruction_following(prompt, response),
# "hallucination": check_hallucination(reference, response),
# "assumption": check_assumption(response),
# "coherence": check_coherence(response),
# "accuracy": check_accuracy(reference, response),
# }
# # Weighted avg score (you can adjust weights)
# metrics["final_score"] = round(
# 0.25 * metrics["instruction_following"]
# + 0.25 * metrics["accuracy"]
# + 0.2 * metrics["hallucination"]
# + 0.15 * metrics["coherence"]
# + 0.15 * metrics["assumption"],
# 3,
# )
# return metrics
# def evaluate_dataframe(df: pd.DataFrame):
# metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
# # Leaderboard
# leaderboard = (
# metrics_df.groupby(["agent", "task_id"])["final_score"]
# .mean()
# .reset_index()
# )
# # # Plots
# # images = []
# # Existing images list
# images = []
# # Add radar chart
# radar_path = plot_radar_chart(metrics_df, agents=df["agent"].unique(),
#
###############################################################################################################################
import re
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import uuid
import numpy as np
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
AutoModelForCausalLM,
pipeline
)
from sentence_transformers import SentenceTransformer, util
import evaluate
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# --------------------------
# MODEL LOADING
# --------------------------
# NOTE: every model below is fetched from the Hugging Face hub and loaded at
# import time, so importing this module is slow and needs network access on
# the first run.
NLI_MODEL = "microsoft/deberta-v2-xlarge-mnli"
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
LLM_JUDGE_MODEL = "microsoft/DialoGPT-large"  # Can be replaced with more powerful models
# Load NLI model & tokenizer — used for entailment/contradiction scoring in
# check_instruction_following and check_hallucination.
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
nli_model.to("cuda" if torch.cuda.is_available() else "cpu")
nli_model.eval()
# Load embedding model (cosine-similarity metrics; picks its own device).
embed_model = SentenceTransformer(EMBED_MODEL)
# Load LLM judge — queried by check_assumption for a yes/no judgment.
judge_tokenizer = AutoTokenizer.from_pretrained(LLM_JUDGE_MODEL)
judge_model = AutoModelForCausalLM.from_pretrained(LLM_JUDGE_MODEL)
judge_model.to("cuda" if torch.cuda.is_available() else "cpu")
judge_model.eval()
# Reference-based text metrics from the `evaluate` library (check_accuracy).
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# Label mapping from config: NLI class index -> upper-cased label name
# (e.g. "ENTAILMENT"); the metric functions match these by substring.
id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
# --------------------------
# IMPROVED METRIC FUNCTIONS
# --------------------------
def check_instruction_following(prompt: str, response: str) -> float:
    """Score how well `response` follows `prompt`, in [0, 1].

    Blends an NLI signal (entailment counts fully, neutral counts half;
    weight 0.7) with embedding cosine similarity (weight 0.3).
    """
    if not prompt or not response:
        return 0.0

    # NLI signal: run prompt/response through the MNLI model.
    with torch.no_grad():
        encoded = nli_tokenizer.encode_plus(
            prompt,
            response,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(nli_model.device)
        logits = nli_model(**encoded).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    entailment, neutral = 0.0, 0.0
    for idx, prob in enumerate(probs):
        name = id2label.get(idx, "")
        if "ENTAIL" in name:
            entailment = float(prob)
        elif "NEUTRAL" in name:
            neutral = float(prob)
    nli_component = entailment + (neutral * 0.5)

    # Embedding signal: cosine similarity between prompt and response.
    prompt_vec = embed_model.encode(prompt, convert_to_tensor=True)
    response_vec = embed_model.encode(response, convert_to_tensor=True)
    cosine = float(util.cos_sim(prompt_vec, response_vec).item())

    combined = 0.7 * nli_component + 0.3 * cosine
    return round(min(1.0, max(0.0, combined)), 3)
def check_hallucination(reference: str, response: str) -> float:
    """Score factual consistency of `response` against `reference`.

    Higher means LESS hallucination. Combines an NLI penalty
    (contradiction counts fully, neutral counts 30%; weight 0.7) with an
    embedding-dissimilarity penalty (weight 0.3), then inverts.
    """
    if not reference or not response:
        return 0.0

    # NLI penalty: contradiction/neutral probabilities from the MNLI model.
    with torch.no_grad():
        encoded = nli_tokenizer.encode_plus(
            reference,
            response,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(nli_model.device)
        logits = nli_model(**encoded).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    contradiction, neutral = 0.0, 0.0
    for idx, prob in enumerate(probs):
        name = id2label.get(idx, "")
        if "CONTRA" in name:
            contradiction = float(prob)
        elif "NEUTRAL" in name:
            neutral = float(prob)
    nli_penalty = contradiction + (neutral * 0.3)

    # Embedding penalty: low similarity to the reference is penalized.
    ref_vec = embed_model.encode(reference, convert_to_tensor=True)
    resp_vec = embed_model.encode(response, convert_to_tensor=True)
    similarity = float(util.cos_sim(ref_vec, resp_vec).item())

    score = 1.0 - (0.7 * nli_penalty + 0.3 * (1 - similarity))
    return round(min(1.0, max(0.0, score)), 3)
def check_assumption(response: str) -> float:
    """Score the ABSENCE of speculation/hedging in `response` (1.0 = none).

    Combines a regex hedge-word density score (weight 0.6) with a yes/no
    judgment from the generative LLM judge (weight 0.4).
    """
    if not response:
        return 0.0

    # Pattern-based detection of speculative / hedging language.
    speculative_patterns = [
        r"\b(maybe|perhaps|possibly|probably|might|could|would|should)\b",
        r"\b(I think|I believe|I guess|I suppose|I assume)\b",
        r"\b(it seems|it appears|it looks like)\b",
        r"\b(likely|unlikely|presumably|arguably)\b",
        r"\b(some|many|most|often|usually|generally|typically)\b"
    ]
    pattern_count = sum(
        len(re.findall(pattern, response.lower()))
        for pattern in speculative_patterns
    )
    # Length normalization: roughly one hedge per 5 words saturates the penalty.
    word_count = len(response.split())
    pattern_score = min(1.0, pattern_count / max(1, word_count / 5))

    # LLM-based judgment.
    assumption_prompt = f"""
    Determine if the following text contains assumptions, speculation, or hedging language.
    Text: {response}
    Answer with only 'yes' or 'no':
    """
    with torch.no_grad():
        # FIX: move the input ids to the judge's device (the model may be on
        # CUDA; the original left the tensor on CPU and would crash there).
        input_ids = judge_tokenizer.encode(
            assumption_prompt, return_tensors="pt"
        ).to(judge_model.device)
        output_ids = judge_model.generate(
            input_ids,
            max_length=input_ids.shape[1] + 3,
            pad_token_id=judge_tokenizer.eos_token_id
        )
    # BUG FIX: decode only the generated continuation. The original decoded
    # the FULL sequence, which includes the prompt text "Answer with only
    # 'yes' or 'no'" — so "yes" was always present and llm_score was
    # unconditionally 0.0.
    continuation = judge_tokenizer.decode(
        output_ids[0][input_ids.shape[1]:], skip_special_tokens=True
    )
    llm_score = 0.0 if "yes" in continuation.lower() else 1.0

    # Combined score: higher = fewer assumptions.
    final_score = 0.6 * (1 - pattern_score) + 0.4 * llm_score
    return round(final_score, 3)
def check_coherence(response: str) -> float:
    """Heuristic coherence score in [0, 1].

    Weighted blend of four surface features: sentence count (0.3),
    sentence-length uniformity (0.2), transition-word usage (0.3) and
    vocabulary non-repetition (0.2).
    """
    if not response:
        return 0.0

    # Split into non-empty sentences.
    fragments = (part.strip() for part in re.split(r'[.!?]+', response))
    sentence_list = [frag for frag in fragments if frag]
    if not sentence_list:
        return 0.0

    # Feature: sentence-length variation (high variance is penalized).
    lengths = [len(sentence.split()) for sentence in sentence_list]
    variance = np.var(lengths) if len(lengths) > 1 else 0
    uniformity = 1.0 - min(1.0, variance / 100)

    # Feature: discourse connectives (substring match, saturates at 3).
    connectives = (
        'however', 'therefore', 'moreover', 'furthermore', 'consequently',
        'additionally', 'likewise', 'similarly', 'nevertheless', 'nonetheless'
    )
    lowered = response.lower()
    connective_hits = sum(1 for word in connectives if word in lowered)
    transition_feature = min(1.0, connective_hits / 3)

    # Feature: share of distinct tokens (penalizes repetition).
    tokens = lowered.split()
    distinct_ratio = len(set(tokens)) / max(1, len(tokens))

    total = (
        0.3 * min(1.0, len(sentence_list) / 5) +
        0.2 * uniformity +
        0.3 * transition_feature +
        0.2 * distinct_ratio
    )
    return round(min(1.0, max(0.0, total)), 3)
def check_accuracy(reference: str, response: str) -> float:
    """Semantic accuracy of `response` vs `reference`, in [0, 1].

    Weighted blend: BERTScore F1 (0.4), ROUGE-L (0.3), BLEU (0.1) and
    embedding cosine similarity (0.2).
    """
    if not reference or not response:
        return 0.0

    # BERTScore (F1 component only).
    bert_results = bertscore.compute(
        predictions=[response],
        references=[reference],
        lang="en",
        model_type=EMBED_MODEL
    )
    bert_f1 = bert_results['f1'][0]

    # ROUGE-L.
    rouge_results = rouge.compute(
        predictions=[response],
        references=[reference],
        use_stemmer=True
    )
    rouge_l = rouge_results['rougeL']

    # BLEU (for shorter responses).
    # FIX: the `evaluate` BLEU metric expects raw strings and tokenizes
    # internally; the original passed pre-split token lists. Also narrowed
    # the bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
    try:
        bleu_results = bleu.compute(
            predictions=[response],
            references=[[reference]]
        )
        bleu_score = bleu_results['bleu']
    except Exception:
        bleu_score = 0.0

    # Embedding cosine similarity.
    ref_emb = embed_model.encode(reference, convert_to_tensor=True)
    resp_emb = embed_model.encode(response, convert_to_tensor=True)
    semantic_sim = float(util.cos_sim(ref_emb, resp_emb).item())

    # Renamed local (was `accuracy_score`, which shadowed the sklearn import).
    combined = (
        0.4 * bert_f1 +
        0.3 * rouge_l +
        0.1 * bleu_score +
        0.2 * semantic_sim
    )
    return round(max(0.0, min(1.0, combined)), 3)
def check_relevance(prompt: str, response: str) -> float:
    """Relevance of `response` to `prompt`: embedding cosine similarity,
    clamped to [0, 1] and rounded to 3 decimals."""
    if not prompt or not response:
        return 0.0

    # Embed both texts, then compare with cosine similarity.
    prompt_vec = embed_model.encode(prompt, convert_to_tensor=True)
    response_vec = embed_model.encode(response, convert_to_tensor=True)
    score = float(util.cos_sim(prompt_vec, response_vec).item())
    return round(min(1.0, max(0.0, score)), 3)
def check_fluency(response: str) -> float:
    """Fluency of `response` in [0, 1] via a CoLA acceptability classifier.

    Averages per-sentence acceptability over the first 3 sentences; falls
    back to a crude length heuristic if the classifier is unavailable.
    """
    if not response:
        return 0.0

    try:
        # PERF FIX: the original rebuilt the HF pipeline (model download +
        # load) on EVERY call; cache it on the function after first use.
        checker = getattr(check_fluency, "_checker", None)
        if checker is None:
            checker = pipeline(
                "text-classification",
                model="textattack/roberta-base-CoLA",
                device=0 if torch.cuda.is_available() else -1
            )
            check_fluency._checker = checker

        # Split into sentences; skip trivially short fragments.
        sentences = [s.strip() for s in re.split(r'[.!?]+', response)
                     if len(s.strip()) > 5]
        if not sentences:
            return 0.5

        # Score each sentence; LABEL_1 = acceptable for this model.
        fluency_scores = []
        for sent in sentences[:3]:  # limit to first 3 sentences
            result = checker(sent[:512])  # truncate overly long input
            score = (result[0]['score'] if result[0]['label'] == 'LABEL_1'
                     else 1 - result[0]['score'])
            fluency_scores.append(score)

        return round(sum(fluency_scores) / len(fluency_scores), 3)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt.
        # Fallback heuristic when the classifier cannot be used.
        return 0.3 if len(response.split()) < 3 else 0.7
# --------------------------
# ROW & DF EVALUATION
# --------------------------
def evaluate_row(row):
    """Compute every metric for one dataframe row.

    Returns a dict with the row's task_id/agent, each metric in [0, 1],
    and a weighted `final_score` (weights sum to 1.0).
    """
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")

    metrics = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
        "relevance": check_relevance(prompt, response),
        "fluency": check_fluency(response),
    }

    # Weighted average (insertion order matches the summation order).
    weights = {
        "instruction_following": 0.20,
        "accuracy": 0.20,
        "hallucination": 0.15,
        "coherence": 0.10,
        "assumption": 0.10,
        "relevance": 0.15,
        "fluency": 0.10,
    }
    metrics["final_score"] = round(
        sum(weight * metrics[name] for name, weight in weights.items()), 3
    )
    return metrics
# --------------------------
# VISUALIZATION FUNCTIONS
# --------------------------
def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
    """Draw a polar (radar) plot of each agent's mean score per metric.

    Saves the figure to `out_path` and returns that path.
    """
    axis_count = len(metrics)
    # One spoke per metric; duplicate the first angle to close the polygon.
    theta = np.linspace(0, 2 * np.pi, axis_count, endpoint=False).tolist()
    theta.append(theta[0])

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    for agent in agents:
        agent_rows = metrics_df[metrics_df['agent'] == agent]
        means = []
        for metric in metrics:
            avg = agent_rows[metric].mean()
            means.append(0 if np.isnan(avg) else avg)  # NaN -> 0 on the chart
        means.append(means[0])  # close the polygon
        ax.plot(theta, means, label=agent, linewidth=2)
        ax.fill(theta, means, alpha=0.25)

    ax.set_xticks(theta[:-1])
    ax.set_xticklabels(metrics)
    ax.set_yticklabels([])
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    ax.set_title("Agent Performance Radar Chart")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
    """Heatmap of each agent's mean score on every metric.

    Saves the figure to `out_path` and returns that path.
    """
    metrics = ["accuracy", "hallucination", "instruction_following",
               "coherence", "assumption", "relevance", "fluency"]
    pivot = metrics_df.groupby("agent")[metrics].mean()
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot, annot=True, cmap="YlGnBu", fmt=".3f", center=0.5)
    # FIX: title was mojibake ("Agent Γ— Metric") — a UTF-8 "×" that had been
    # decoded with the wrong codec; restored the intended multiplication sign.
    plt.title("Agent × Metric Heatmap")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
def plot_score_distribution(metrics_df, out_path="/tmp/distribution.png"):
    """KDE plot of final-score distributions, one filled curve per agent.

    Saves the figure to `out_path` and returns that path.
    """
    plt.figure(figsize=(10, 6))
    for agent in metrics_df['agent'].unique():
        scores = metrics_df.loc[metrics_df['agent'] == agent, 'final_score']
        sns.kdeplot(scores, label=agent, fill=True, alpha=0.3)
    plt.xlabel('Final Score')
    plt.ylabel('Density')
    plt.title('Distribution of Final Scores by Agent')
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
def plot_metric_correlation(metrics_df, out_path="/tmp/correlation.png"):
    """Heatmap of pairwise correlations between all metrics (incl. final_score).

    Saves the figure to `out_path` and returns that path.
    """
    metric_cols = ["accuracy", "hallucination", "instruction_following",
                   "coherence", "assumption", "relevance", "fluency", "final_score"]
    corr = metrics_df[metric_cols].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0,
                fmt=".2f", square=True)
    plt.title('Correlation Between Metrics')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
def plot_agent_comparison(metrics_df, out_path="/tmp/agent_comparison.png"):
    """Grouped bar chart of each agent's mean score on every metric.

    Saves the figure to `out_path` and returns that path.
    """
    metric_cols = ["accuracy", "hallucination", "instruction_following",
                   "coherence", "assumption", "relevance", "fluency"]
    per_agent = metrics_df.groupby('agent')[metric_cols].mean()

    # NOTE: pandas' .plot opens its own figure; the plt.figure call is kept
    # to match the original behavior exactly.
    plt.figure(figsize=(12, 6))
    per_agent.plot(kind='bar', colormap='Set3')
    plt.title('Agent Performance Across Metrics')
    plt.xlabel('Agent')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
# --------------------------
# MAIN EVALUATION FUNCTION
# --------------------------
def evaluate_dataframe(df: pd.DataFrame):
    """Evaluate a dataframe of agent responses.

    Returns (metrics_df, images, leaderboard): per-row metrics, a list of
    (image_path, caption) tuples, and mean final_score per agent/task.
    """
    metrics_df = df.apply(evaluate_row, axis=1, result_type='expand')

    # Leaderboard: mean final score per (agent, task) pair.
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
    )

    # Generate every visualization and pair each path with its caption.
    metric_names = ["accuracy", "hallucination", "instruction_following",
                    "coherence", "assumption", "relevance", "fluency"]
    images = [
        (plot_radar_chart(metrics_df, df["agent"].unique(), metric_names),
         "Radar Chart: Agent vs Metrics"),
        (plot_heatmap(metrics_df), "Heatmap: Agent vs Metrics"),
        (plot_score_distribution(metrics_df), "Score Distribution by Agent"),
        (plot_metric_correlation(metrics_df), "Metric Correlation Matrix"),
        (plot_agent_comparison(metrics_df), "Agent Comparison Chart"),
    ]

    return metrics_df, images, leaderboard