import gradio as gr
import pandas as pd
import numpy as np
import json
import re
import torch
from sentence_transformers import SentenceTransformer, util
import matplotlib
matplotlib.use('Agg') # Use a non-interactive backend for Matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from io import BytesIO
from tqdm import tqdm
import tempfile # Added for handling temporary image files
# --- CONFIGURATION ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = 'all-MiniLM-L6-v2'
# --- INITIALIZATION ---
try:
print(f"Loading SentenceTransformer model '{MODEL_NAME}' onto {DEVICE}...")
model = SentenceTransformer(MODEL_NAME, device=DEVICE)
print("Model loaded successfully.")
except Exception as e:
print(f"Fatal Error: Could not load SentenceTransformer model: {e}")
model = None
# --- SCORING FUNCTIONS ---
def score_instruction_following(prompt, response):
"""Scores how well the response follows explicit formatting instructions in the prompt."""
prompt = prompt.lower()
response = str(response)
if "single number" in prompt:
if re.fullmatch(r"^\s*[-+]?\d+(\.\d+)?\s*$", response):
return 1.0, "Success: Followed 'single number' instruction."
return 0.0, "Failed: Did not provide a single number as instructed."
if "comma separated list" in prompt:
if ',' in response:
return 1.0, "Success: Followed 'comma separated list' instruction."
return 0.0, "Failed: Did not provide a comma-separated list."
if "few words" in prompt:
word_count = len(response.split())
if word_count <= 5:
return 1.0, "Success: Response was concise as requested."
score = max(0, 1 - (word_count - 5) * 0.1)
return score, f"Failed: Response was too verbose ({word_count} words)."
return 1.0, "No specific formatting instructions to evaluate against."
def score_hallucination_and_accuracy(response, ground_truth):
"""Scores factual accuracy by comparing response semantically to the ground truth."""
if model is None: return 0.0, "Skipped: SentenceTransformer model not loaded."
try:
embedding1 = model.encode(str(response), convert_to_tensor=True, device=DEVICE)
embedding2 = model.encode(str(ground_truth), convert_to_tensor=True, device=DEVICE)
similarity = util.cos_sim(embedding1, embedding2)
score = similarity.item()
reason = f"Response semantic similarity to ground truth is {score:.2f}."
if score < 0.5: reason += " (Low similarity suggests inaccuracy)."
elif score < 0.8: reason += " (Moderate similarity)."
else: reason += " (High similarity suggests accuracy)."
return score, reason
except Exception as e:
return 0.0, f"Error during semantic comparison: {e}"
def score_assumption_control(response):
"""Scores based on the presence of hedging or assumption-making language."""
response_lower = str(response).lower()
assumption_phrases = ["i assume", "i think", "probably", "likely", "it seems", "i believe", "i would guess", "it might be", "perhaps"]
found_phrases = [p for p in assumption_phrases if p in response_lower]
if not found_phrases:
return 1.0, "Success: No unwarranted assumption language detected."
score = max(0.0, 1.0 - 0.2 * len(found_phrases))
return score, f"Potential unwarranted assumptions detected. Found phrases: {', '.join(found_phrases)}."
def score_coherence(response):
"""Performs a basic coherence check based on length and content."""
response_str = str(response).strip()
if not response_str: return 0.0, "Failed: Response was empty."
word_count = len(response_str.split())
if word_count == 1 and len(response_str) < 3:
return 0.5, "Warning: Response is very short, may lack coherence."
return 1.0, "Success: Response has a coherent length."
# --- PLOTTING FUNCTIONS ---
def create_heatmap(leaderboard_df):
"""Generates a heatmap of agent scores and returns the plot object."""
if leaderboard_df.empty: return None
score_columns = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
heatmap_df = leaderboard_df.set_index('Agent Name')[score_columns].astype(float)
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(10, max(6, len(heatmap_df) * 0.4)))
sns.heatmap(heatmap_df, annot=True, cmap="viridis", fmt=".2f", linewidths=.5, ax=ax, annot_kws={"color": "white", "size": 10})
ax.set_title('Agent Performance Heatmap', fontsize=16, color='white', pad=20)
ax.tick_params(axis='x', colors='white'); ax.tick_params(axis='y', colors='white')
plt.xticks(rotation=45, ha="right"); plt.yticks(rotation=0)
fig.patch.set_facecolor('#1a1a1a'); ax.set_facecolor('#1a1a1a')
return fig
def create_spider_charts(leaderboard_df):
"""Generates spider charts, saves them as temp files, and returns a list of file paths."""
if leaderboard_df.empty: return []
score_columns = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
agents = leaderboard_df['Agent Name'].tolist()
charts = []
plt.style.use('dark_background')
for agent in agents:
agent_data = leaderboard_df[leaderboard_df['Agent Name'] == agent][score_columns].iloc[0]
values = agent_data.astype(float).tolist()
values += values[:1]
labels = np.array(score_columns)
angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
angles += angles[:1]
fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(polar=True))
fig.patch.set_facecolor('#1a1a1a'); ax.set_facecolor('#1a1a1a')
ax.fill(angles, values, color='cyan', alpha=0.25)
ax.plot(angles, values, color='cyan', linewidth=2)
ax.set_yticklabels([]); ax.set_xticks(angles[:-1]); ax.set_xticklabels(labels, color='white')
ax.set_title(f"{agent} Performance", size=12, color='white', y=1.1)
# Save the figure to a temporary file and append the file path to the list
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmpfile:
fig.savefig(tmpfile.name, bbox_inches='tight')
charts.append(tmpfile.name)
plt.close(fig) # Close the figure to free up memory
return charts
# --- MAIN EVALUATION LOGIC ---
def evaluate_agents(submissions_file, ground_truth_file, progress=gr.Progress()):
"""
Main function to process files, score agents, and generate reports.
"""
if model is None:
raise gr.Error("The SentenceTransformer model could not be loaded. Please check the logs.")
try:
submissions = [json.loads(line) for line in open(submissions_file.name)]
ground_truths = [json.loads(line) for line in open(ground_truth_file.name)]
except Exception as e:
raise gr.Error(f"Error reading or parsing JSONL files: {e}")
ground_truth_lookup = {item['task_id']: item for item in ground_truths}
agent_scores = {}
for i, submission in enumerate(tqdm(submissions, desc="Evaluating Agent Responses")):
task_id = submission.get('task_id')
if not task_id or task_id not in ground_truth_lookup: continue
agent_name = "_".join(task_id.split('_')[:2])
if agent_name not in agent_scores:
agent_scores[agent_name] = {'scores': [], 'reasons': [], 'raw_data': []}
gt = ground_truth_lookup[task_id]
prompt = gt.get('prompt', '')
response = str(submission.get('model_answer', ''))
ground_truth_answer = gt.get('ground_truth_answer', '')
s_inst, r_inst = score_instruction_following(prompt, response)
s_acc, r_acc = score_hallucination_and_accuracy(response, ground_truth_answer)
s_ass, r_ass = score_assumption_control(response)
s_coh, r_coh = score_coherence(response)
current_scores = [float(s_inst), float(s_acc), float(s_ass), float(s_coh)]
current_reasons = [r_inst, r_acc, r_ass, r_coh]
agent_scores[agent_name]['scores'].append(current_scores)
agent_scores[agent_name]['reasons'].append(current_reasons)
agent_scores[agent_name]['raw_data'].append({'task_id': task_id, 'prompt': prompt, 'response': response, 'ground_truth': ground_truth_answer})
progress((i + 1) / len(submissions), desc=f"Processing {agent_name}")
if not agent_scores:
raise gr.Error("No valid agent data found. Check if task_ids match between files.")
report_data = []
detailed_report = "## Detailed Agent Evaluation Report\n\n"
score_labels = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
for agent_name, data in agent_scores.items():
if not data['scores']: continue
avg_scores = pd.DataFrame(data['scores'], columns=score_labels).mean(axis=0)
overall_score = avg_scores.mean()
report_data.append([agent_name, overall_score] + avg_scores.tolist())
detailed_report += f"### Agent: {agent_name}\n**Overall Score: {overall_score:.2f}**\n\n"
for raw, scores, reasons in zip(data['raw_data'], data['scores'], data['reasons']):
detailed_report += f"**Task:** `{raw['task_id']}`\n- **Prompt:** *{raw['prompt']}*\n- **Response:** `{raw['response']}`\n- **Scores & Reasons:**\n"
detailed_report += f" - `Instruction Following`: {scores[0]:.2f} - *{reasons[0]}*\n"
detailed_report += f" - `Accuracy/Factuality`: {scores[1]:.2f} - *{reasons[1]}*\n"
detailed_report += f" - `Assumption Control`: {scores[2]:.2f} - *{reasons[2]}*\n"
detailed_report += f" - `Coherence`: {scores[3]:.2f} - *{reasons[3]}*\n\n"
columns = ["Agent Name", "Overall Score"] + score_labels
leaderboard_df = pd.DataFrame(report_data, columns=columns).sort_values(by="Overall Score", ascending=False)
heatmap_plot = create_heatmap(leaderboard_df)
spider_plots = create_spider_charts(leaderboard_df)
# Format the DataFrame for display after all calculations and plotting are done
display_df = leaderboard_df.copy()
for col in ["Overall Score"] + score_labels:
display_df[col] = display_df[col].map('{:.2f}'.format)
return display_df, heatmap_plot, spider_plots, detailed_report
# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="cyan", secondary_hue="blue"), css="footer {display: none !important}") as demo:
gr.Markdown("# Scoring Framework for Large-Scale Agent Evaluation (AetherScore)")
gr.Markdown("Upload agent submission and ground truth files in `.jsonl` format to evaluate performance across multiple dimensions.")
with gr.Row():
with gr.Column(scale=1):
submissions_file = gr.File(label="Upload Submissions File (`submissions.jsonl`)")
ground_truth_file = gr.File(label="Upload Ground Truth File (`ground_truth.jsonl`)")
submit_btn = gr.Button("π Evaluate Agents", variant="primary")
gr.Examples(examples=[["submissions.jsonl", "ground_truth.jsonl"]], inputs=[submissions_file, ground_truth_file], label="Sample Data")
with gr.Column(scale=3):
with gr.Tabs():
with gr.TabItem("π Leaderboard"):
leaderboard_output = gr.DataFrame(headers=["Agent Name", "Overall Score", "Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"], interactive=False)
with gr.TabItem("π₯ Heatmap"):
heatmap_output = gr.Plot(label="Agent Performance Heatmap")
with gr.TabItem("πΈοΈ Spider Profiles"):
spider_output = gr.Gallery(label="Agent Performance Profiles", columns=4, object_fit="contain", height="auto")
with gr.TabItem("π Detailed Report"):
report_output = gr.Markdown()
submit_btn.click(
fn=evaluate_agents,
inputs=[submissions_file, ground_truth_file],
outputs=[leaderboard_output, heatmap_output, spider_output, report_output]
)
if __name__ == "__main__":
demo.launch()