| | import gradio as gr |
| | import pandas as pd |
| | import numpy as np |
| | import json |
| | import re |
| | import torch |
| | from sentence_transformers import SentenceTransformer, util |
| | import matplotlib |
| | matplotlib.use('Agg') |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| | import base64 |
| | from io import BytesIO |
| | from tqdm import tqdm |
| | import tempfile |
| |
|
| | |
# Embedding runs on GPU when available; falls back to CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Compact general-purpose sentence-embedding model used for similarity scoring.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Load the model once at import time. On failure, `model` is set to None so the
# app can still start; scoring functions check for None and degrade gracefully.
try:
    print(f"Loading SentenceTransformer model '{MODEL_NAME}' onto {DEVICE}...")
    model = SentenceTransformer(MODEL_NAME, device=DEVICE)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Fatal Error: Could not load SentenceTransformer model: {e}")
    model = None
| |
|
| | |
| |
|
def score_instruction_following(prompt, response):
    """Score how well *response* follows explicit formatting instructions in *prompt*.

    Args:
        prompt: Task prompt; scanned case-insensitively for known directives
            ("single number", "comma separated list", "few words").
        response: The agent's answer; coerced to ``str`` before checking.

    Returns:
        Tuple ``(score, reason)`` where ``score`` is in ``[0.0, 1.0]`` and
        ``reason`` is a human-readable explanation.
    """
    prompt = prompt.lower()
    response = str(response)

    if "single number" in prompt:
        # re.fullmatch already anchors the pattern; the original ^/$ were redundant.
        if re.fullmatch(r"\s*[-+]?\d+(\.\d+)?\s*", response):
            return 1.0, "Success: Followed 'single number' instruction."
        return 0.0, "Failed: Did not provide a single number as instructed."

    # Accept both spellings of the directive; the original missed the hyphenated
    # form even though its own failure message uses "comma-separated".
    if "comma separated list" in prompt or "comma-separated list" in prompt:
        if ',' in response:
            return 1.0, "Success: Followed 'comma separated list' instruction."
        return 0.0, "Failed: Did not provide a comma-separated list."

    if "few words" in prompt:
        word_count = len(response.split())
        if word_count <= 5:
            return 1.0, "Success: Response was concise as requested."
        # Linear penalty: 0.1 deducted per word beyond the 5-word budget, floor 0.
        score = max(0, 1 - (word_count - 5) * 0.1)
        return score, f"Failed: Response was too verbose ({word_count} words)."

    return 1.0, "No specific formatting instructions to evaluate against."
| |
|
def score_hallucination_and_accuracy(response, ground_truth):
    """Score factual accuracy as the cosine similarity between the embeddings
    of *response* and *ground_truth*.

    Returns a ``(score, reason)`` tuple; score is 0.0 when the embedding model
    is unavailable or the comparison raises.
    """
    if model is None:
        return 0.0, "Skipped: SentenceTransformer model not loaded."

    try:
        resp_vec = model.encode(str(response), convert_to_tensor=True, device=DEVICE)
        truth_vec = model.encode(str(ground_truth), convert_to_tensor=True, device=DEVICE)
        score = util.cos_sim(resp_vec, truth_vec).item()

        reason = f"Response semantic similarity to ground truth is {score:.2f}."
        # Qualitative interpretation bands for the similarity value.
        if score < 0.5:
            reason += " (Low similarity suggests inaccuracy)."
        elif score < 0.8:
            reason += " (Moderate similarity)."
        else:
            reason += " (High similarity suggests accuracy)."
        return score, reason
    except Exception as e:
        return 0.0, f"Error during semantic comparison: {e}"
| |
|
def score_assumption_control(response):
    """Penalize hedging/assumption language: 0.2 deducted per distinct phrase found,
    floored at 0.0. Returns a ``(score, reason)`` tuple."""
    text = str(response).lower()
    hedges = ("i assume", "i think", "probably", "likely", "it seems",
              "i believe", "i would guess", "it might be", "perhaps")

    found_phrases = []
    for phrase in hedges:
        if phrase in text:
            found_phrases.append(phrase)

    if not found_phrases:
        return 1.0, "Success: No unwarranted assumption language detected."

    penalty = 0.2 * len(found_phrases)
    return max(0.0, 1.0 - penalty), f"Potential unwarranted assumptions detected. Found phrases: {', '.join(found_phrases)}."
| |
|
def score_coherence(response):
    """Basic coherence heuristic based on response length.

    Empty responses score 0.0; a single token shorter than 3 characters scores
    0.5; anything else scores 1.0. Returns a ``(score, reason)`` tuple.
    """
    text = str(response).strip()
    if not text:
        return 0.0, "Failed: Response was empty."

    n_words = len(text.split())
    if n_words == 1 and len(text) < 3:
        return 0.5, "Warning: Response is very short, may lack coherence."
    return 1.0, "Success: Response has a coherent length."
| |
|
| | |
| |
|
def create_heatmap(leaderboard_df):
    """Render a dark-themed heatmap of per-dimension agent scores.

    Returns a matplotlib Figure, or None when the leaderboard is empty.
    """
    if leaderboard_df.empty:
        return None

    score_columns = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    heatmap_df = leaderboard_df.set_index('Agent Name')[score_columns].astype(float)

    plt.style.use('dark_background')
    # Scale figure height with the number of agents so row labels stay readable.
    height = max(6, len(heatmap_df) * 0.4)
    fig, ax = plt.subplots(figsize=(10, height))
    sns.heatmap(heatmap_df, annot=True, cmap="viridis", fmt=".2f", linewidths=.5,
                ax=ax, annot_kws={"color": "white", "size": 10})
    ax.set_title('Agent Performance Heatmap', fontsize=16, color='white', pad=20)
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    fig.patch.set_facecolor('#1a1a1a')
    ax.set_facecolor('#1a1a1a')

    return fig
| |
|
def create_spider_charts(leaderboard_df):
    """Draw one polar (spider) chart per agent and save each to a temp PNG.

    Returns a list of file paths (empty when the leaderboard is empty).
    """
    if leaderboard_df.empty:
        return []

    score_columns = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    chart_paths = []

    plt.style.use('dark_background')
    labels = np.array(score_columns)
    # Angles are the same for every agent, so compute them once; the first
    # angle is repeated at the end to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    for agent in leaderboard_df['Agent Name'].tolist():
        row = leaderboard_df[leaderboard_df['Agent Name'] == agent][score_columns].iloc[0]
        values = row.astype(float).tolist()
        values += values[:1]  # repeat first value to close the polygon

        fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(polar=True))
        fig.patch.set_facecolor('#1a1a1a')
        ax.set_facecolor('#1a1a1a')
        ax.fill(angles, values, color='cyan', alpha=0.25)
        ax.plot(angles, values, color='cyan', linewidth=2)
        ax.set_yticklabels([])
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, color='white')
        ax.set_title(f"{agent} Performance", size=12, color='white', y=1.1)

        # delete=False: the Gradio Gallery reads these files after we return.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            fig.savefig(tmp.name, bbox_inches='tight')
            chart_paths.append(tmp.name)

        plt.close(fig)

    return chart_paths
| |
|
| | |
| |
|
def evaluate_agents(submissions_file, ground_truth_file, progress=gr.Progress()):
    """
    Main function to process files, score agents, and generate reports.

    Args:
        submissions_file: Uploaded file object; one JSON object per line with
            at least 'task_id' and 'model_answer' keys.
        ground_truth_file: Uploaded file object; one JSON object per line with
            'task_id', 'prompt' and 'ground_truth_answer' keys.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple ``(display_df, heatmap_plot, spider_plots, detailed_report)``
        matching the four UI output components.

    Raises:
        gr.Error: If the model is unavailable, the files cannot be parsed, or
            no task_ids match between the two files.
    """
    if model is None:
        raise gr.Error("The SentenceTransformer model could not be loaded. Please check the logs.")

    try:
        # Fix: the original opened both files without ever closing them.
        # Context managers guarantee closure; skipping blank lines keeps a
        # trailing newline in either JSONL file from crashing json.loads.
        with open(submissions_file.name, encoding='utf-8') as f:
            submissions = [json.loads(line) for line in f if line.strip()]
        with open(ground_truth_file.name, encoding='utf-8') as f:
            ground_truths = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        raise gr.Error(f"Error reading or parsing JSONL files: {e}")

    ground_truth_lookup = {item['task_id']: item for item in ground_truths}
    agent_scores = {}

    for i, submission in enumerate(tqdm(submissions, desc="Evaluating Agent Responses")):
        task_id = submission.get('task_id')
        # Silently skip submissions with no matching ground truth.
        if not task_id or task_id not in ground_truth_lookup:
            continue

        # Agent-name convention: the first two underscore-separated task_id tokens.
        agent_name = "_".join(task_id.split('_')[:2])
        if agent_name not in agent_scores:
            agent_scores[agent_name] = {'scores': [], 'reasons': [], 'raw_data': []}

        gt = ground_truth_lookup[task_id]
        prompt = gt.get('prompt', '')
        response = str(submission.get('model_answer', ''))
        ground_truth_answer = gt.get('ground_truth_answer', '')

        # Score the response along the four evaluation dimensions.
        s_inst, r_inst = score_instruction_following(prompt, response)
        s_acc, r_acc = score_hallucination_and_accuracy(response, ground_truth_answer)
        s_ass, r_ass = score_assumption_control(response)
        s_coh, r_coh = score_coherence(response)

        current_scores = [float(s_inst), float(s_acc), float(s_ass), float(s_coh)]
        current_reasons = [r_inst, r_acc, r_ass, r_coh]

        agent_scores[agent_name]['scores'].append(current_scores)
        agent_scores[agent_name]['reasons'].append(current_reasons)
        agent_scores[agent_name]['raw_data'].append({'task_id': task_id, 'prompt': prompt, 'response': response, 'ground_truth': ground_truth_answer})

        progress((i + 1) / len(submissions), desc=f"Processing {agent_name}")

    if not agent_scores:
        raise gr.Error("No valid agent data found. Check if task_ids match between files.")

    report_data = []
    detailed_report = "## Detailed Agent Evaluation Report\n\n"
    score_labels = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]

    for agent_name, data in agent_scores.items():
        if not data['scores']:
            continue
        # Per-dimension means for this agent; the overall score is the mean of those.
        avg_scores = pd.DataFrame(data['scores'], columns=score_labels).mean(axis=0)
        overall_score = avg_scores.mean()
        report_data.append([agent_name, overall_score] + avg_scores.tolist())

        detailed_report += f"### Agent: {agent_name}\n**Overall Score: {overall_score:.2f}**\n\n"
        for raw, scores, reasons in zip(data['raw_data'], data['scores'], data['reasons']):
            detailed_report += f"**Task:** `{raw['task_id']}`\n- **Prompt:** *{raw['prompt']}*\n- **Response:** `{raw['response']}`\n- **Scores & Reasons:**\n"
            detailed_report += f" - `Instruction Following`: {scores[0]:.2f} - *{reasons[0]}*\n"
            detailed_report += f" - `Accuracy/Factuality`: {scores[1]:.2f} - *{reasons[1]}*\n"
            detailed_report += f" - `Assumption Control`: {scores[2]:.2f} - *{reasons[2]}*\n"
            detailed_report += f" - `Coherence`: {scores[3]:.2f} - *{reasons[3]}*\n\n"

    columns = ["Agent Name", "Overall Score"] + score_labels
    leaderboard_df = pd.DataFrame(report_data, columns=columns).sort_values(by="Overall Score", ascending=False)

    heatmap_plot = create_heatmap(leaderboard_df)
    spider_plots = create_spider_charts(leaderboard_df)

    # Format scores as fixed two-decimal strings for display only; the raw
    # numeric frame is what the plotting helpers consumed above.
    display_df = leaderboard_df.copy()
    for col in ["Overall Score"] + score_labels:
        display_df[col] = display_df[col].map('{:.2f}'.format)

    return display_df, heatmap_plot, spider_plots, detailed_report
| |
|
| | |
# --- Gradio UI definition ---
# Two-column layout: file uploads + trigger on the left, tabbed results
# (leaderboard, heatmap, spider profiles, detailed report) on the right.
with gr.Blocks(theme=gr.themes.Default(primary_hue="cyan", secondary_hue="blue"), css="footer {display: none !important}") as demo:
    gr.Markdown("# Scoring Framework for Large-Scale Agent Evaluation (AetherScore)")
    gr.Markdown("Upload agent submission and ground truth files in `.jsonl` format to evaluate performance across multiple dimensions.")

    with gr.Row():
        with gr.Column(scale=1):
            submissions_file = gr.File(label="Upload Submissions File (`submissions.jsonl`)")
            ground_truth_file = gr.File(label="Upload Ground Truth File (`ground_truth.jsonl`)")
            submit_btn = gr.Button("π Evaluate Agents", variant="primary")
            # Example paths are relative to the app's working directory.
            gr.Examples(examples=[["submissions.jsonl", "ground_truth.jsonl"]], inputs=[submissions_file, ground_truth_file], label="Sample Data")

        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.TabItem("π Leaderboard"):
                    leaderboard_output = gr.DataFrame(headers=["Agent Name", "Overall Score", "Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"], interactive=False)
                with gr.TabItem("π₯ Heatmap"):
                    heatmap_output = gr.Plot(label="Agent Performance Heatmap")
                with gr.TabItem("πΈοΈ Spider Profiles"):
                    spider_output = gr.Gallery(label="Agent Performance Profiles", columns=4, object_fit="contain", height="auto")
                with gr.TabItem("π Detailed Report"):
                    report_output = gr.Markdown()

    # Wire the evaluate button to the pipeline; outputs map 1:1 to the tabs above.
    submit_btn.click(
        fn=evaluate_agents,
        inputs=[submissions_file, ground_truth_file],
        outputs=[leaderboard_output, heatmap_output, spider_output, report_output]
    )

if __name__ == "__main__":
    demo.launch()
| |
|
| |
|