# Source: Hugging Face Space by Supastrikas-004 — "Update app.py", commit a8c2fc4 (verified).
# (File-viewer header converted to a comment so this file remains valid Python.)
import gradio as gr
import pandas as pd
import numpy as np
import json
import re
import torch
from sentence_transformers import SentenceTransformer, util
import matplotlib
matplotlib.use('Agg') # Use a non-interactive backend for Matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from io import BytesIO
from tqdm import tqdm
import tempfile # Added for handling temporary image files
# --- CONFIGURATION ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = 'all-MiniLM-L6-v2'

# --- INITIALIZATION ---
# Load the embedding model once at import time. Downstream scoring functions
# check `model is None` and degrade gracefully when loading failed.
print(f"Loading SentenceTransformer model '{MODEL_NAME}' onto {DEVICE}...")
try:
    model = SentenceTransformer(MODEL_NAME, device=DEVICE)
except Exception as e:
    print(f"Fatal Error: Could not load SentenceTransformer model: {e}")
    model = None
else:
    print("Model loaded successfully.")
# --- SCORING FUNCTIONS ---
def score_instruction_following(prompt, response):
    """Score how well `response` follows explicit formatting instructions in `prompt`.

    Checks, in priority order: 'single number', 'comma separated list',
    and 'few words' instructions. Returns a `(score, reason)` tuple with
    the score in [0, 1].
    """
    # Normalize: lowercase, and fold the hyphenated variant ("comma-separated")
    # into the spaced form so both phrasings of the instruction are recognized.
    prompt = str(prompt).lower().replace("comma-separated", "comma separated")
    response = str(response)
    if "single number" in prompt:
        # re.fullmatch already anchors the pattern, so no ^/$ are needed.
        if re.fullmatch(r"\s*[-+]?\d+(\.\d+)?\s*", response):
            return 1.0, "Success: Followed 'single number' instruction."
        return 0.0, "Failed: Did not provide a single number as instructed."
    if "comma separated list" in prompt:
        if ',' in response:
            return 1.0, "Success: Followed 'comma separated list' instruction."
        return 0.0, "Failed: Did not provide a comma-separated list."
    if "few words" in prompt:
        word_count = len(response.split())
        if word_count <= 5:
            return 1.0, "Success: Response was concise as requested."
        # Lose 0.1 per word beyond five, floored at zero.
        score = max(0, 1 - (word_count - 5) * 0.1)
        return score, f"Failed: Response was too verbose ({word_count} words)."
    return 1.0, "No specific formatting instructions to evaluate against."
def score_hallucination_and_accuracy(response, ground_truth):
    """Score factual accuracy as the cosine similarity between the response
    and ground-truth sentence embeddings. Returns a `(score, reason)` tuple."""
    if model is None:
        return 0.0, "Skipped: SentenceTransformer model not loaded."
    try:
        resp_emb = model.encode(str(response), convert_to_tensor=True, device=DEVICE)
        truth_emb = model.encode(str(ground_truth), convert_to_tensor=True, device=DEVICE)
        score = util.cos_sim(resp_emb, truth_emb).item()
        reason = f"Response semantic similarity to ground truth is {score:.2f}."
        if score < 0.5:
            qualifier = " (Low similarity suggests inaccuracy)."
        elif score < 0.8:
            qualifier = " (Moderate similarity)."
        else:
            qualifier = " (High similarity suggests accuracy)."
        return score, reason + qualifier
    except Exception as e:
        return 0.0, f"Error during semantic comparison: {e}"
def score_assumption_control(response):
    """Score hedging/assumption language; 1.0 means none detected.

    Each detected hedge phrase costs 0.2, floored at 0.0. Returns a
    `(score, reason)` tuple."""
    text = str(response).lower()
    hedges = ("i assume", "i think", "probably", "likely", "it seems",
              "i believe", "i would guess", "it might be", "perhaps")
    hits = [phrase for phrase in hedges if phrase in text]
    if hits:
        penalty = 0.2 * len(hits)
        return max(0.0, 1.0 - penalty), (
            f"Potential unwarranted assumptions detected. Found phrases: {', '.join(hits)}."
        )
    return 1.0, "Success: No unwarranted assumption language detected."
def score_coherence(response):
    """Basic coherence heuristic driven by response length.

    Empty -> 0.0, a lone token under three characters -> 0.5,
    anything else -> 1.0. Returns a `(score, reason)` tuple."""
    text = str(response).strip()
    if not text:
        return 0.0, "Failed: Response was empty."
    words = text.split()
    # A single very short token (e.g. "k") is flagged as possibly incoherent.
    if len(words) == 1 and len(text) < 3:
        return 0.5, "Warning: Response is very short, may lack coherence."
    return 1.0, "Success: Response has a coherent length."
# --- PLOTTING FUNCTIONS ---
def create_heatmap(leaderboard_df):
    """Build a dark-themed heatmap of per-agent scores; returns the Figure,
    or None when the leaderboard is empty."""
    if leaderboard_df.empty:
        return None
    metrics = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    data = leaderboard_df.set_index('Agent Name')[metrics].astype(float)
    plt.style.use('dark_background')
    # Scale figure height with the number of agents so rows stay readable.
    fig, ax = plt.subplots(figsize=(10, max(6, len(data) * 0.4)))
    sns.heatmap(
        data, annot=True, cmap="viridis", fmt=".2f", linewidths=.5,
        ax=ax, annot_kws={"color": "white", "size": 10},
    )
    ax.set_title('Agent Performance Heatmap', fontsize=16, color='white', pad=20)
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    fig.patch.set_facecolor('#1a1a1a')
    ax.set_facecolor('#1a1a1a')
    return fig
def create_spider_charts(leaderboard_df):
    """Render one polar (spider) chart per agent, save each to a temp PNG,
    and return the list of file paths (empty list for an empty leaderboard)."""
    if leaderboard_df.empty:
        return []
    metrics = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    labels = np.array(metrics)
    # Angles are the same for every agent, so compute them once up front.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    closed_angles = angles + angles[:1]
    plt.style.use('dark_background')
    chart_paths = []
    for agent in leaderboard_df['Agent Name'].tolist():
        row = leaderboard_df[leaderboard_df['Agent Name'] == agent][metrics].iloc[0]
        values = row.astype(float).tolist()
        values.append(values[0])  # repeat the first value to close the polygon
        fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(polar=True))
        fig.patch.set_facecolor('#1a1a1a')
        ax.set_facecolor('#1a1a1a')
        ax.fill(closed_angles, values, color='cyan', alpha=0.25)
        ax.plot(closed_angles, values, color='cyan', linewidth=2)
        ax.set_yticklabels([])
        ax.set_xticks(angles)
        ax.set_xticklabels(labels, color='white')
        ax.set_title(f"{agent} Performance", size=12, color='white', y=1.1)
        # Persist each figure to a temp file so the Gradio gallery can load it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmpfile:
            fig.savefig(tmpfile.name, bbox_inches='tight')
            chart_paths.append(tmpfile.name)
        plt.close(fig)  # free figure memory
    return chart_paths
# --- MAIN EVALUATION LOGIC ---
def _load_jsonl(path):
    """Parse a JSONL file into a list of dicts; blank lines are skipped."""
    # Context manager closes the handle even on parse errors (the original
    # bare open() calls leaked file handles); UTF-8 is forced for portability.
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def evaluate_agents(submissions_file, ground_truth_file, progress=gr.Progress()):
    """
    Main function to process files, score agents, and generate reports.

    Args:
        submissions_file: Uploaded `.jsonl` file of agent answers
            (each line needs `task_id` and `model_answer`).
        ground_truth_file: Uploaded `.jsonl` file with `task_id`, `prompt`
            and `ground_truth_answer` per line.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        Tuple of (formatted leaderboard DataFrame, heatmap Figure,
        list of spider-chart image paths, markdown detailed report).

    Raises:
        gr.Error: if the embedding model is unavailable, the files cannot be
            parsed, or no task_ids match between the two files.
    """
    if model is None:
        raise gr.Error("The SentenceTransformer model could not be loaded. Please check the logs.")
    try:
        submissions = _load_jsonl(submissions_file.name)
        ground_truths = _load_jsonl(ground_truth_file.name)
    except Exception as e:
        raise gr.Error(f"Error reading or parsing JSONL files: {e}")
    ground_truth_lookup = {item['task_id']: item for item in ground_truths}
    score_labels = ["Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"]
    agent_scores = {}
    for i, submission in enumerate(tqdm(submissions, desc="Evaluating Agent Responses")):
        task_id = submission.get('task_id')
        if not task_id or task_id not in ground_truth_lookup:
            continue  # nothing to score against
        # Agent name convention: first two '_'-separated segments of task_id.
        agent_name = "_".join(task_id.split('_')[:2])
        bucket = agent_scores.setdefault(agent_name, {'scores': [], 'reasons': [], 'raw_data': []})
        gt = ground_truth_lookup[task_id]
        prompt = gt.get('prompt', '')
        response = str(submission.get('model_answer', ''))
        ground_truth_answer = gt.get('ground_truth_answer', '')
        s_inst, r_inst = score_instruction_following(prompt, response)
        s_acc, r_acc = score_hallucination_and_accuracy(response, ground_truth_answer)
        s_ass, r_ass = score_assumption_control(response)
        s_coh, r_coh = score_coherence(response)
        bucket['scores'].append([float(s_inst), float(s_acc), float(s_ass), float(s_coh)])
        bucket['reasons'].append([r_inst, r_acc, r_ass, r_coh])
        bucket['raw_data'].append({'task_id': task_id, 'prompt': prompt,
                                   'response': response, 'ground_truth': ground_truth_answer})
        progress((i + 1) / len(submissions), desc=f"Processing {agent_name}")
    if not agent_scores:
        raise gr.Error("No valid agent data found. Check if task_ids match between files.")
    report_data = []
    detailed_report = "## Detailed Agent Evaluation Report\n\n"
    for agent_name, data in agent_scores.items():
        if not data['scores']:
            continue
        # Column-wise mean over all of this agent's tasks.
        avg_scores = pd.DataFrame(data['scores'], columns=score_labels).mean(axis=0)
        overall_score = avg_scores.mean()
        report_data.append([agent_name, overall_score] + avg_scores.tolist())
        detailed_report += f"### Agent: {agent_name}\n**Overall Score: {overall_score:.2f}**\n\n"
        for raw, scores, reasons in zip(data['raw_data'], data['scores'], data['reasons']):
            detailed_report += f"**Task:** `{raw['task_id']}`\n- **Prompt:** *{raw['prompt']}*\n- **Response:** `{raw['response']}`\n- **Scores & Reasons:**\n"
            detailed_report += f" - `Instruction Following`: {scores[0]:.2f} - *{reasons[0]}*\n"
            detailed_report += f" - `Accuracy/Factuality`: {scores[1]:.2f} - *{reasons[1]}*\n"
            detailed_report += f" - `Assumption Control`: {scores[2]:.2f} - *{reasons[2]}*\n"
            detailed_report += f" - `Coherence`: {scores[3]:.2f} - *{reasons[3]}*\n\n"
    columns = ["Agent Name", "Overall Score"] + score_labels
    leaderboard_df = pd.DataFrame(report_data, columns=columns).sort_values(by="Overall Score", ascending=False)
    heatmap_plot = create_heatmap(leaderboard_df)
    spider_plots = create_spider_charts(leaderboard_df)
    # Format a display copy only after all numeric work and plotting are done,
    # so the plots see real floats rather than formatted strings.
    display_df = leaderboard_df.copy()
    for col in ["Overall Score"] + score_labels:
        display_df[col] = display_df[col].map('{:.2f}'.format)
    return display_df, heatmap_plot, spider_plots, detailed_report
# --- GRADIO UI ---
# Declarative UI: file inputs + submit button on the left, tabbed results
# (leaderboard, heatmap, spider profiles, detailed report) on the right.
with gr.Blocks(theme=gr.themes.Default(primary_hue="cyan", secondary_hue="blue"), css="footer {display: none !important}") as demo:
    gr.Markdown("# Scoring Framework for Large-Scale Agent Evaluation (AetherScore)")
    gr.Markdown("Upload agent submission and ground truth files in `.jsonl` format to evaluate performance across multiple dimensions.")
    with gr.Row():
        with gr.Column(scale=1):
            submissions_file = gr.File(label="Upload Submissions File (`submissions.jsonl`)")
            ground_truth_file = gr.File(label="Upload Ground Truth File (`ground_truth.jsonl`)")
            submit_btn = gr.Button("🚀 Evaluate Agents", variant="primary")
            # NOTE(review): assumes the sample .jsonl files ship alongside app.py — confirm.
            gr.Examples(examples=[["submissions.jsonl", "ground_truth.jsonl"]], inputs=[submissions_file, ground_truth_file], label="Sample Data")
        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.TabItem("🏆 Leaderboard"):
                    leaderboard_output = gr.DataFrame(headers=["Agent Name", "Overall Score", "Instruction Following", "Accuracy/Factuality", "Assumption Control", "Coherence"], interactive=False)
                with gr.TabItem("🔥 Heatmap"):
                    heatmap_output = gr.Plot(label="Agent Performance Heatmap")
                with gr.TabItem("🕸️ Spider Profiles"):
                    spider_output = gr.Gallery(label="Agent Performance Profiles", columns=4, object_fit="contain", height="auto")
                with gr.TabItem("📜 Detailed Report"):
                    report_output = gr.Markdown()
    # Wire the button to the evaluator; outputs map one-to-one onto the tabs.
    submit_btn.click(
        fn=evaluate_agents,
        inputs=[submissions_file, ground_truth_file],
        outputs=[leaderboard_output, heatmap_output, spider_output, report_output]
    )
# Script entry point: launch the Gradio app only when run directly.
if __name__ == "__main__":
    demo.launch()