Spaces:
Sleeping
Sleeping
File size: 4,412 Bytes
c4a10c9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | import json
import os
import pandas as pd
import gradio as gr
import plotly.graph_objects as go
from pathlib import Path
# --- Load Data ---
root_dir = Path(__file__).parent.absolute()
summary_path = root_dir / "summary_report.json"
scored_path = root_dir / "scored_responses.json"
summary_data = {
"behavioral_fingerprint": 0.0,
"dimensions": {}
}
scored_data = []
try:
if summary_path.exists():
with open(summary_path, "r", encoding="utf-8") as f:
summary_data = json.load(f)
else:
print(f"Warning: {summary_path} not found.")
if scored_path.exists():
with open(scored_path, "r", encoding="utf-8") as f:
scored_data = json.load(f)
else:
print(f"Warning: {scored_path} not found.")
except Exception as e:
print(f"Error loading data: {e}")
# --- Generate Plotly Figure ---
dimensions = list(summary_data.get("dimensions", {}).keys())
fig = go.Figure()
if dimensions:
model_a_scores = [summary_data["dimensions"][dim]["model_a"] for dim in dimensions]
model_b_scores = [summary_data["dimensions"][dim]["model_b"] for dim in dimensions]
if len(dimensions) < 3:
fig.add_trace(go.Bar(name='Model A', x=dimensions, y=model_a_scores))
fig.add_trace(go.Bar(name='Model B', x=dimensions, y=model_b_scores))
fig.update_layout(
barmode='group',
title="Behavioral Dimension Comparison (0 to 1 scale)",
yaxis=dict(range=[0, 1])
)
else:
dimensions_closed = dimensions + [dimensions[0]]
model_a_scores_closed = model_a_scores + [model_a_scores[0]]
model_b_scores_closed = model_b_scores + [model_b_scores[0]]
fig.add_trace(go.Scatterpolar(
r=model_a_scores_closed,
theta=dimensions_closed,
fill='toself',
name='Model A',
line_color='rgba(99, 110, 250, 0.8)',
fillcolor='rgba(99, 110, 250, 0.3)'
))
fig.add_trace(go.Scatterpolar(
r=model_b_scores_closed,
theta=dimensions_closed,
fill='toself',
name='Model B',
line_color='rgba(239, 85, 59, 0.8)',
fillcolor='rgba(239, 85, 59, 0.3)'
))
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 1]
)),
showlegend=True,
title="Behavioral Dimension Comparison (0 to 1 scale)",
paper_bgcolor="rgba(0,0,0,0)",
plot_bgcolor="rgba(0,0,0,0)",
)
# --- Generate Pandas DataFrame ---
if scored_data:
df = pd.DataFrame(scored_data)
# Extract only needed columns
cols_to_keep = ["dimension", "prompt", "model_a_response", "model_b_response", "distance"]
# Handle missing columns gracefully
for col in cols_to_keep:
if col not in df.columns:
df[col] = "N/A" if col != "distance" else 0.0
df = df[cols_to_keep]
df = df.rename(columns={
"dimension": "Dimension",
"prompt": "Prompt",
"model_a_response": "Model A Response",
"model_b_response": "Model B Response",
"distance": "Distance"
})
df = df.sort_values(by="Distance", ascending=False).reset_index(drop=True)
else:
df = pd.DataFrame(columns=["Dimension", "Prompt", "Model A Response", "Model B Response", "Distance"])
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# LLM Diff: Behavioral Divergence Report")
gr.Markdown("### *A 'git diff' for model behavior.* Analyze how different versions of LLMs react to identical prompts.")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## Global Behavioral Fingerprint")
gr.Markdown(f"# {summary_data.get('behavioral_fingerprint', 0.0):.4f}")
gr.Markdown("*(Scale: 0.0 to 1.0. Higher = More Divergent)*")
with gr.Column(scale=2):
gr.Plot(value=fig)
gr.Markdown("---")
gr.Markdown("## Top Divergent Prompts")
gr.Markdown("Explore the specific prompts where the models disagreed the most.")
gr.Dataframe(
value=df,
wrap=True,
interactive=False,
row_count=(10, "dynamic")
)
if __name__ == "__main__":
demo.launch(share=True)
|