| | import gradio as gr |
| | import pandas as pd |
| | import numpy as np |
| | import json |
| | import re |
| | import torch |
| | from sentence_transformers import SentenceTransformer, util |
| | import matplotlib |
| | matplotlib.use('Agg') |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| | import base64 |
| | from io import BytesIO |
| | from tqdm import tqdm |
| | import tempfile |
| |
|
| | |
# Embedding runs on GPU when available; falls back to CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Compact general-purpose sentence-embedding model used for similarity scoring.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Load the model once at import time. On failure, `model` is set to None so the
# app can still start; scoring functions check for None and degrade gracefully.
try:
    print(f"Loading SentenceTransformer model '{MODEL_NAME}' onto {DEVICE}...")
    model = SentenceTransformer(MODEL_NAME, device=DEVICE)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Fatal Error: Could not load SentenceTransformer model: {e}")
    model = None
| |
|
| | |
| |
|
def score_instruction_following(prompt, response):
    """Score how well *response* follows explicit formatting instructions in *prompt*.

    Args:
        prompt: Task prompt; scanned case-insensitively for known directives
            ("single number", "comma separated list", "few words").
        response: The agent's answer; coerced to ``str`` before checking.

    Returns:
        Tuple ``(score, reason)`` where ``score`` is in ``[0.0, 1.0]`` and
        ``reason`` is a human-readable explanation.
    """
    prompt = prompt.lower()
    response = str(response)

    if "single number" in prompt:
        # re.fullmatch already anchors the pattern; the original ^/$ were redundant.
        if re.fullmatch(r"\s*[-+]?\d+(\.\d+)?\s*", response):
            return 1.0, "Success: Followed 'single number' instruction."
        return 0.0, "Failed: Did not provide a single number as instructed."

    # Accept both spellings of the directive; the original missed the hyphenated
    # form even though its own failure message uses "comma-separated".
    if "comma separated list" in prompt or "comma-separated list" in prompt:
        if ',' in response:
            return 1.0, "Success: Followed 'comma separated list' instruction."
        return 0.0, "Failed: Did not provide a comma-separated list."

    if "few words" in prompt:
        word_count = len(response.split())
        if word_count <= 5:
            return 1.0, "Success: Response was concise as requested."
        # Linear penalty: 0.1 deducted per word beyond the 5-word budget, floor 0.
        score = max(0, 1 - (word_count - 5) * 0.1)
        return score, f"Failed: Response was too verbose ({word_count} words)."

    return 1.0, "No specific formatting instructions to evaluate against."
| |
|
def score_hallucination_and_accuracy(response, ground_truth):
    """Score factual accuracy as the cosine similarity between the embeddings
    of *response* and *ground_truth*.

    Returns a ``(score, reason)`` tuple; score is 0.0 when the embedding model
    is unavailable or the comparison raises.
    """
    if model is None:
        return 0.0, "Skipped: SentenceTransformer model not loaded."

    try:
        resp_vec = model.encode(str(response), convert_to_tensor=True, device=DEVICE)
        truth_vec = model.encode(str(ground_truth), convert_to_tensor=True, device=DEVICE)
        score = util.cos_sim(resp_vec, truth_vec).item()

        reason = f"Response semantic similarity to ground truth is {score:.2f}."
        # Qualitative interpretation bands for the similarity value.
        if score < 0.5:
            reason += " (Low similarity suggests inaccuracy)."
        elif score < 0.8:
            reason += " (Moderate similarity)."
        else:
            reason += " (High similarity suggests accuracy)."
        return score, reason
    except Exception as e:
        return 0.0, f"Error during semantic comparison: {e}"
| |
|
def score_assumption_control(response):
    """Penalize hedging/assumption language: 0.2 deducted per distinct phrase found,
    floored at 0.0. Returns a ``(score, reason)`` tuple."""
    text = str(response).lower()
    hedges = ("i assume", "i think", "probably", "likely", "it seems",
              "i believe", "i would guess", "it might be", "perhaps")

    found_phrases = []
    for phrase in hedges:
        if phrase in text:
            found_phrases.append(phrase)

    if not found_phrases:
        return 1.0, "Success: No unwarranted assumption language detected."

    penalty = 0.2 * len(found_phrases)
    return max(0.0, 1.0 - penalty), f"Potential unwarranted assumptions detected. Found phrases: {', '.join(found_phrases)}."
| |
|
def score_coherence(response):
    """Basic coherence heuristic based on response length.

    Empty responses score 0.0; a single token shorter than 3 characters scores
    0.5; anything else scores 1.0. Returns a ``(score, reason)`` tuple.
    """
    text = str(response).strip()
    if not text:
        return 0.0, "Failed: Response was empty."

    n_words = len(text.split())
    if n_words == 1 and len(text) < 3:
        return 0.5, "Warning: Response is very short, may lack coherence."
    return 1.0, "Success: Response has a coherent length."
| |
|
| | |
| |
|
def create_heatmap(leaderboard_df):
    """Render a dark-themed heatmap of per-dimension agent scores.

    Returns a matplotlib Figure, or None when the leaderboard is empty.
    """
    if leaderboard_df.empty:
        return None

    score_columns = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    heatmap_df = leaderboard_df.set_index('Agent Name')[score_columns].astype(float)

    plt.style.use('dark_background')
    # Scale figure height with the number of agents so row labels stay readable.
    height = max(6, len(heatmap_df) * 0.4)
    fig, ax = plt.subplots(figsize=(10, height))
    sns.heatmap(heatmap_df, annot=True, cmap="viridis", fmt=".2f", linewidths=.5,
                ax=ax, annot_kws={"color": "white", "size": 10})
    ax.set_title('Agent Performance Heatmap', fontsize=16, color='white', pad=20)
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    fig.patch.set_facecolor('#1a1a1a')
    ax.set_facecolor('#1a1a1a')

    return fig
| |
|
def create_spider_charts(leaderboard_df):
    """Draw one polar (spider) chart per agent and save each to a temp PNG.

    Returns a list of file paths (empty when the leaderboard is empty).
    """
    if leaderboard_df.empty:
        return []

    score_columns = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    chart_paths = []

    plt.style.use('dark_background')
    labels = np.array(score_columns)
    # Angles are the same for every agent, so compute them once; the first
    # angle is repeated at the end to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    for agent in leaderboard_df['Agent Name'].tolist():
        row = leaderboard_df[leaderboard_df['Agent Name'] == agent][score_columns].iloc[0]
        values = row.astype(float).tolist()
        values += values[:1]  # repeat first value to close the polygon

        fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(polar=True))
        fig.patch.set_facecolor('#1a1a1a')
        ax.set_facecolor('#1a1a1a')
        ax.fill(angles, values, color='cyan', alpha=0.25)
        ax.plot(angles, values, color='cyan', linewidth=2)
        ax.set_yticklabels([])
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, color='white')
        ax.set_title(f"{agent} Performance", size=12, color='white', y=1.1)

        # delete=False: the Gradio Gallery reads these files after we return.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            fig.savefig(tmp.name, bbox_inches='tight')
            chart_paths.append(tmp.name)

        plt.close(fig)

    return chart_paths
| |
|
| | |
| |
|
def evaluate_agents(submissions_file, ground_truth_file, progress=gr.Progress()):
    """
    Main function to process files, score agents, and generate reports.

    Args:
        submissions_file: Uploaded file object; one JSON object per line with
            at least 'task_id' and 'model_answer' keys.
        ground_truth_file: Uploaded file object; one JSON object per line with
            'task_id', 'prompt' and 'ground_truth_answer' keys.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple ``(display_df, heatmap_plot, spider_plots, detailed_report)``
        matching the four UI output components.

    Raises:
        gr.Error: If the model is unavailable, the files cannot be parsed, or
            no task_ids match between the two files.
    """
    if model is None:
        raise gr.Error("The SentenceTransformer model could not be loaded. Please check the logs.")

    try:
        # Fix: the original opened both files without ever closing them.
        # Context managers guarantee closure; skipping blank lines keeps a
        # trailing newline in either JSONL file from crashing json.loads.
        with open(submissions_file.name, encoding='utf-8') as f:
            submissions = [json.loads(line) for line in f if line.strip()]
        with open(ground_truth_file.name, encoding='utf-8') as f:
            ground_truths = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        raise gr.Error(f"Error reading or parsing JSONL files: {e}")

    ground_truth_lookup = {item['task_id']: item for item in ground_truths}
    agent_scores = {}

    for i, submission in enumerate(tqdm(submissions, desc="Evaluating Agent Responses")):
        task_id = submission.get('task_id')
        # Silently skip submissions with no matching ground truth.
        if not task_id or task_id not in ground_truth_lookup:
            continue

        # Agent-name convention: the first two underscore-separated task_id tokens.
        agent_name = "_".join(task_id.split('_')[:2])
        if agent_name not in agent_scores:
            agent_scores[agent_name] = {'scores': [], 'reasons': [], 'raw_data': []}

        gt = ground_truth_lookup[task_id]
        prompt = gt.get('prompt', '')
        response = str(submission.get('model_answer', ''))
        ground_truth_answer = gt.get('ground_truth_answer', '')

        # Score the response along the four evaluation dimensions.
        s_inst, r_inst = score_instruction_following(prompt, response)
        s_acc, r_acc = score_hallucination_and_accuracy(response, ground_truth_answer)
        s_ass, r_ass = score_assumption_control(response)
        s_coh, r_coh = score_coherence(response)

        current_scores = [float(s_inst), float(s_acc), float(s_ass), float(s_coh)]
        current_reasons = [r_inst, r_acc, r_ass, r_coh]

        agent_scores[agent_name]['scores'].append(current_scores)
        agent_scores[agent_name]['reasons'].append(current_reasons)
        agent_scores[agent_name]['raw_data'].append({'task_id': task_id, 'prompt': prompt, 'response': response, 'ground_truth': ground_truth_answer})

        progress((i + 1) / len(submissions), desc=f"Processing {agent_name}")

    if not agent_scores:
        raise gr.Error("No valid agent data found. Check if task_ids match between files.")

    report_data = []
    detailed_report = "## Detailed Agent Evaluation Report\n\n"
    score_labels = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]

    for agent_name, data in agent_scores.items():
        if not data['scores']:
            continue
        # Per-dimension means for this agent; the overall score is the mean of those.
        avg_scores = pd.DataFrame(data['scores'], columns=score_labels).mean(axis=0)
        overall_score = avg_scores.mean()
        report_data.append([agent_name, overall_score] + avg_scores.tolist())

        detailed_report += f"### Agent: {agent_name}\n**Overall Score: {overall_score:.2f}**\n\n"
        for raw, scores, reasons in zip(data['raw_data'], data['scores'], data['reasons']):
            detailed_report += f"**Task:** `{raw['task_id']}`\n- **Prompt:** *{raw['prompt']}*\n- **Response:** `{raw['response']}`\n- **Scores & Reasons:**\n"
            detailed_report += f" - `Instruction Following`: {scores[0]:.2f} - *{reasons[0]}*\n"
            detailed_report += f" - `Accuracy/Factuality`: {scores[1]:.2f} - *{reasons[1]}*\n"
            detailed_report += f" - `Assumption Control`: {scores[2]:.2f} - *{reasons[2]}*\n"
            detailed_report += f" - `Coherence`: {scores[3]:.2f} - *{reasons[3]}*\n\n"

    columns = ["Agent Name", "Overall Score"] + score_labels
    leaderboard_df = pd.DataFrame(report_data, columns=columns).sort_values(by="Overall Score", ascending=False)

    heatmap_plot = create_heatmap(leaderboard_df)
    spider_plots = create_spider_charts(leaderboard_df)

    # Format scores as fixed two-decimal strings for display only; the raw
    # numeric frame is what the plotting helpers consumed above.
    display_df = leaderboard_df.copy()
    for col in ["Overall Score"] + score_labels:
        display_df[col] = display_df[col].map('{:.2f}'.format)

    return display_df, heatmap_plot, spider_plots, detailed_report
| |
|
| | |
# --- Gradio UI definition ---
# Two-column layout: file uploads + trigger on the left, tabbed results
# (leaderboard, heatmap, spider profiles, detailed report) on the right.
with gr.Blocks(theme=gr.themes.Default(primary_hue="cyan", secondary_hue="blue"), css="footer {display: none !important}") as demo:
    gr.Markdown("# Scoring Framework for Large-Scale Agent Evaluation (AetherScore)")
    gr.Markdown("Upload agent submission and ground truth files in `.jsonl` format to evaluate performance across multiple dimensions.")

    with gr.Row():
        with gr.Column(scale=1):
            submissions_file = gr.File(label="Upload Submissions File (`submissions.jsonl`)")
            ground_truth_file = gr.File(label="Upload Ground Truth File (`ground_truth.jsonl`)")
            submit_btn = gr.Button("π Evaluate Agents", variant="primary")
            # Example paths are relative to the app's working directory.
            gr.Examples(examples=[["submissions.jsonl", "ground_truth.jsonl"]], inputs=[submissions_file, ground_truth_file], label="Sample Data")

        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.TabItem("π Leaderboard"):
                    leaderboard_output = gr.DataFrame(headers=["Agent Name", "Overall Score", "Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"], interactive=False)
                with gr.TabItem("π₯ Heatmap"):
                    heatmap_output = gr.Plot(label="Agent Performance Heatmap")
                with gr.TabItem("πΈοΈ Spider Profiles"):
                    spider_output = gr.Gallery(label="Agent Performance Profiles", columns=4, object_fit="contain", height="auto")
                with gr.TabItem("π Detailed Report"):
                    report_output = gr.Markdown()

    # Wire the evaluate button to the pipeline; outputs map 1:1 to the tabs above.
    submit_btn.click(
        fn=evaluate_agents,
        inputs=[submissions_file, ground_truth_file],
        outputs=[leaderboard_output, heatmap_output, spider_output, report_output]
    )

if __name__ == "__main__":
    demo.launch()
| |
|
| |
|