# evaluation-framework / evaluator.py
# (Hugging Face Space file header — uploaded by manayporwal07, commit f97959c verified, 15.5 kB)
# """
# Evaluation module: loads models, computes metrics, and creates visualizations.
# Lightweight, CPU-friendly, no Java required.
# """
# import re
# import math
# import uuid
# from typing import List, Dict, Tuple
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from sentence_transformers import SentenceTransformer, util
# # --------------------------
# # MODEL LOADING
# # --------------------------
# NLI_MODEL = "textattack/roberta-base-MNLI"
# EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# # Load NLI model & tokenizer
# nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
# nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
# nli_model.to("cpu")
# nli_model.eval()
# # Load embedding model
# embed_model = SentenceTransformer(EMBED_MODEL)
# # Label mapping from config
# id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
# # --------------------------
# # METRIC FUNCTIONS
# # --------------------------
# def check_instruction_following(prompt: str, response: str) -> float:
# """Embedding-based similarity between prompt and response."""
# if not prompt or not response:
# return 0.0
# p_emb = embed_model.encode(prompt, convert_to_tensor=True)
# r_emb = embed_model.encode(response, convert_to_tensor=True)
# sim = float(util.cos_sim(p_emb, r_emb).item())
# return round(max(0.0, min(1.0, sim)), 3)
# def check_hallucination(reference: str, response: str) -> float:
# """
# Single hallucination score:
# Entailment prob - Contradiction prob (normalized to [0,1]).
# Higher = less hallucination.
# """
# if not reference or not response:
# return 0.0
# with torch.no_grad():
# inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
# outputs = nli_model(**inputs)
# probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
# entail_prob, contra_prob = 0.0, 0.0
# for idx, p in enumerate(probs):
# label = id2label.get(idx, "")
# if "ENTAIL" in label:
# entail_prob = float(p)
# elif "CONTRA" in label:
# contra_prob = float(p)
# score = entail_prob - contra_prob
# score = (score + 1) / 2 # normalize [-1,1] β†’ [0,1]
# return round(max(0.0, min(1.0, score)), 3)
# def check_assumption(response: str) -> float:
# """Detect speculative/hedging terms."""
# if not response:
# return 0.0
# speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
# count = sum(1 for t in speculative_terms if t in response.lower())
# score = 1.0 - min(count / 5.0, 1.0) # smoother decay
# return round(score, 3)
# def check_coherence(response: str) -> float:
# """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
# if not response:
# return 0.0
# words = len(re.findall(r"\w+", response))
# sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
# if words < 5:
# return 0.3
# if words > 200:
# return 0.5
# base = min(1.0, (words / 50.0) + (sents / 5.0))
# return round(max(0.4, min(base, 0.95)), 3)
# def check_accuracy(reference: str, response: str) -> float:
# """Semantic similarity between reference and response via embeddings (cosine)."""
# if not reference or not response:
# return 0.0
# ref_emb = embed_model.encode(reference, convert_to_tensor=True)
# resp_emb = embed_model.encode(response, convert_to_tensor=True)
# sim = float(util.cos_sim(ref_emb, resp_emb).item())
# return round(max(0.0, min(1.0, sim)), 3)
# # --------------------------
# # SCORING PIPELINE
# # --------------------------
# def compute_row_scores(prompt, response, reference) -> Dict:
# instr = check_instruction_following(prompt, response)
# halluc = check_hallucination(reference, response)
# assum = check_assumption(response)
# coh = check_coherence(response)
# acc = check_accuracy(reference, response)
# # Final score: average
# components = [instr, halluc, assum, coh, acc]
# final = round(float(sum(components) / len(components)), 3)
# return {
# "InstructionFollowing": instr,
# "Hallucination": halluc,
# "AssumptionControl": assum,
# "Coherence": coh,
# "Accuracy": acc,
# "FinalScore": final,
# }
# # --------------------------
# # VISUALIZATION HELPERS
# # --------------------------
# # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
# # """Radar chart for multiple agents."""
# # N = len(labels)
# # angles = [n / float(N) * 2 * math.pi for n in range(N)]
# # angles += angles[:1]
# # fig = plt.figure(figsize=(6.5, 6.5))
# # ax = plt.subplot(111, polar=True)
# # ax.set_xticks(angles[:-1])
# # ax.set_xticklabels(labels, fontsize=9)
# # ax.set_ylim(0, 100)
# # ax.set_yticks([0, 25, 50, 75, 100])
# # for r in rows:
# # values = r["values"]
# # values_closed = values + values[:1]
# # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
# # ax.fill(angles, values_closed, alpha=fill_alpha)
# # ax.set_title(title, y=1.08, fontsize=12)
# # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
# # return fig
# # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
# # fig, ax = plt.subplots(figsize=(7, 5))
# # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
# # ax.set_title(title)
# # return fig
# # --------------------------
# # HIGH-LEVEL EVALUATION
# # --------------------------
# def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
# """
# df must contain: prompt, response, task, agent, reference
# Returns: metrics_df, [(image_path, caption)], leaderboard_df
# """
# df = df.rename(columns={c: c.strip() for c in df.columns})
# rows = []
# for _, r in df.iterrows():
# prompt = r.get("prompt", "")
# response = r.get("response", "")
# reference = r.get("reference", "")
# agent = r.get("agent", "Unknown")
# task = r.get("task", "Unknown")
# scores = compute_row_scores(prompt, response, reference)
# entry = {
# "Task": str(task).strip(),
# "Agent": str(agent),
# "Prompt": prompt,
# "Response": response,
# "Reference": reference,
# }
# entry.update(scores)
# rows.append(entry)
# metrics_df = pd.DataFrame(rows)
# # Visualization artifacts
# images = []
# metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
# # Per-task radar and bar charts
# for task, g in metrics_df.groupby("Task"):
# series = []
# for a in g["Agent"].unique():
# subset = g[g["Agent"] == a]
# vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
# series.append({"name": a, "values": vals})
# if series:
# fig = spider_net_multi(metric_labels, series, title=f"{task} β€” Agent Comparison")
# fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
# fig.savefig(fname, bbox_inches="tight")
# plt.close(fig)
# images.append((fname, f"{task} - radar"))
# fig2, ax = plt.subplots(figsize=(8, 4))
# avg = g.groupby("Agent")[metric_labels].mean()
# avg.plot(kind="bar", ax=ax)
# ax.set_title(f"{task} β€” Average Metrics by Agent")
# ax.set_ylabel("Score (0-1)")
# plt.xticks(rotation=45)
# fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
# fig2.savefig(fname2, bbox_inches="tight")
# plt.close(fig2)
# images.append((fname2, f"{task} - bar"))
# # Global heatmap
# metric_cols = metric_labels + ["FinalScore"]
# figh = heatmap_plot(metrics_df, metric_cols)
# fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
# figh.savefig(fnameh, bbox_inches="tight")
# plt.close(figh)
# images.append((fnameh, "Metric Correlations Heatmap"))
# # Leaderboard
# lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
# lb = lb.sort_values(["FinalScore"], ascending=False)
# return metrics_df, images, lb
# # --------------------------
# # DEMO USAGE
# # --------------------------
# if __name__ == "__main__":
# # Sample dataset
# data = [
# {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
# {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
# {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
# ]
# df = pd.DataFrame(data)
# metrics_df, images, leaderboard = evaluate_dataframe(df)
# print("\n=== Metrics per response ===")
# print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
# print("\n=== Leaderboard (average per task & agent) ===")
# print(leaderboard)
# print("\nVisualization files saved in /tmp/:")
# for path, caption in images:
# print(f"{caption}: {path}")
import re
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import uuid
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
# --------------------------
# MODEL LOADING
# --------------------------
# HF model ids: a RoBERTa fine-tuned on MNLI for entailment checks, and a small
# sentence-embedding model for the cosine-similarity metrics. CPU-friendly sizes.
NLI_MODEL = "textattack/roberta-base-MNLI"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Load NLI model & tokenizer (downloads weights on first run, then cached).
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
nli_model.to("cpu")  # force CPU inference; no GPU assumed
nli_model.eval()  # disable dropout for deterministic scoring
# Load embedding model used by the similarity-based metrics.
embed_model = SentenceTransformer(EMBED_MODEL)
# Map class index -> upper-cased label (e.g. "ENTAILMENT") from the model config;
# consumed by check_hallucination when picking out entail/contradict probabilities.
id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
# --------------------------
# METRIC FUNCTIONS
# --------------------------
def check_instruction_following(prompt: str, response: str) -> float:
    """Score prompt/response relatedness via cosine similarity of sentence embeddings.

    Returns a value in [0, 1] rounded to 3 decimals; 0.0 when either text is empty.
    """
    if not prompt or not response:
        return 0.0
    prompt_vec = embed_model.encode(prompt, convert_to_tensor=True)
    response_vec = embed_model.encode(response, convert_to_tensor=True)
    similarity = float(util.cos_sim(prompt_vec, response_vec).item())
    clamped = min(1.0, max(0.0, similarity))
    return round(clamped, 3)
def check_hallucination(reference: str, response: str) -> float:
    """
    NLI-based hallucination score in [0, 1].

    Runs the MNLI model on the (reference, response) pair, takes
    P(entailment) - P(contradiction), and rescales that difference from
    [-1, 1] to [0, 1]. Higher = response better supported by the reference.
    Returns 0.0 when either input is empty.
    """
    if not reference or not response:
        return 0.0
    with torch.no_grad():
        encoded = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
        logits = nli_model(**encoded).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    p_entail = 0.0
    p_contra = 0.0
    # Match labels by substring so the code survives config label spelling
    # variants ("ENTAILMENT" vs "entailment", etc. — id2label is upper-cased).
    for class_idx, prob in enumerate(probs):
        class_name = id2label.get(class_idx, "")
        if "ENTAIL" in class_name:
            p_entail = float(prob)
        elif "CONTRA" in class_name:
            p_contra = float(prob)
    normalized = ((p_entail - p_contra) + 1) / 2  # [-1,1] -> [0,1]
    return round(min(1.0, max(0.0, normalized)), 3)
def check_assumption(response: str) -> float:
    """
    Penalize speculative/hedging language in a response.

    Counts how many distinct hedging terms appear as whole words and maps the
    count to a score: 1.0 means no hedging, dropping by 0.2 per distinct term,
    floored at 0.0 once five or more terms appear.

    Args:
        response: Model output text to inspect.

    Returns:
        Hedging-control score in [0, 1] rounded to 3 decimals; 0.0 for empty input.
    """
    if not response:
        return 0.0
    speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
    text = response.lower()
    # Whole-word matching: plain substring tests would falsely count words like
    # "mighty" (contains "might") or "couldn't" (contains "could") as hedging.
    count = sum(
        1
        for term in speculative_terms
        if re.search(r"\b" + re.escape(term) + r"\b", text)
    )
    score = 1.0 - min(count / 5.0, 1.0)  # smooth decay: 0.2 penalty per term
    return round(score, 3)
def check_coherence(response: str) -> float:
    """
    Rough coherence heuristic from length and sentence count.

    Empty input scores 0.0; replies under 5 words score 0.3, over 200 words
    score 0.5. Everything in between gets a words/sentences blend clamped to
    the range [0.4, 0.95], rounded to 3 decimals.
    """
    if not response:
        return 0.0
    word_count = len(re.findall(r"\w+", response))
    sentence_count = max(1, len(re.split(r"[.!?]+", response)) - 1)
    if word_count < 5:
        return 0.3
    if word_count > 200:
        return 0.5
    blended = word_count / 50.0 + sentence_count / 5.0
    blended = min(1.0, blended)
    # Clamp to [0.4, 0.95] so mid-length answers never hit the extremes.
    return round(min(0.95, max(0.4, blended)), 3)
def check_accuracy(reference: str, response: str) -> float:
    """Semantic accuracy: cosine similarity of reference vs. response embeddings.

    Returns a value in [0, 1] rounded to 3 decimals; 0.0 when either text is empty.
    """
    if not reference or not response:
        return 0.0
    emb_reference = embed_model.encode(reference, convert_to_tensor=True)
    emb_response = embed_model.encode(response, convert_to_tensor=True)
    cosine = util.cos_sim(emb_reference, emb_response)
    clipped = max(0.0, min(1.0, float(cosine.item())))
    return round(clipped, 3)
# --------------------------
# ROW & DF EVALUATION
# --------------------------
def evaluate_row(row):
    """
    Score a single record (dict-like with prompt/response/reference/task_id/agent).

    Returns a dict carrying task_id and agent through, the five per-metric
    scores, and a weighted "final_score" (instruction 0.25, accuracy 0.25,
    hallucination 0.20, coherence 0.15, assumption 0.15) rounded to 3 decimals.
    """
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")
    scores = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
    }
    # Weighted average; insertion order matches the metric order above so the
    # floating-point summation is identical to summing term by term.
    weights = {
        "instruction_following": 0.25,
        "accuracy": 0.25,
        "hallucination": 0.2,
        "coherence": 0.15,
        "assumption": 0.15,
    }
    scores["final_score"] = round(
        sum(weight * scores[metric] for metric, weight in weights.items()), 3
    )
    return scores
def evaluate_dataframe(df: pd.DataFrame):
    """
    Evaluate every row of *df* and build summary artifacts.

    Args:
        df: DataFrame with columns prompt, response, reference, task_id, agent
            (missing values fall back to empty strings per row).

    Returns:
        (metrics_df, images, leaderboard):
          metrics_df  - one scored row per input row (see evaluate_row),
          images      - list of (png_path, caption) tuples saved under /tmp/plots,
          leaderboard - mean final_score per (agent, task_id), best score first.
    """
    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
    # Leaderboard: average final score per agent/task, ranked best-first
    # (previously returned unsorted, which defeated the point of a leaderboard).
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
        .sort_values("final_score", ascending=False)
        .reset_index(drop=True)
    )
    # Plots
    images = []
    out_dir = "/tmp/plots"
    os.makedirs(out_dir, exist_ok=True)
    # Histogram of final scores across all rows
    plt.figure(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False)
    plt.title("Distribution of Final Scores")
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    plt.savefig(hist_path)
    plt.close()
    images.append((hist_path, "Final Score Distribution"))
    # Mean final score per agent
    plt.figure(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score")
    plt.title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    plt.savefig(bar_path)
    plt.close()
    images.append((bar_path, "Average Score per Agent"))
    return metrics_df, images, leaderboard