import json, os, glob
import gradio as gr
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd
# ── constants ─────────────────────────────────────────────────────────────
# Field keys that make_radar() averages each metric over.
IMPUTE_FIELDS = [
    'recovered_material',
    'recovered_object_type',
    'recovered_condition',
    'recovered_period',
    'recovered_description',
]

# Field key -> human-readable label shown on chart axes and in tables.
# Lookups fall back to the raw key when a field is not listed here.
FIELD_LABELS = {
    'recovered_material': 'Material',
    'recovered_object_type': 'Object Type',
    'recovered_condition': 'Condition',
    'recovered_period': 'Period',
    'recovered_description': 'Description',
}

# Metric key -> display label; the labels become summary-table column names.
METRICS = {
    'exact_match': 'Exact Match',
    'fuzzy_token_sort': 'Fuzzy Match',
    'semantic_sim': 'Semantic Similarity',
    'top3_match': 'Top-3 Match',
    'bleu': 'BLEU (description only)',
}

# Per-run colour palette; charts index into it modulo its length.
COLORS = [
    '#7D3A10',
    '#2d6a4f',
    '#1848A0',
    '#e9c46a',
    '#993556',
]
# ── load all eval jsons ───────────────────────────────────────────────────
def get_eval_files():
    """Return the sorted JSON file paths found in the current directory.

    Both the ``*.json`` and ``eval_results*.json`` patterns are searched.
    Because the second pattern is a subset of the first, the matches are
    merged through a set so each path is returned (and later loaded) once.

    Returns:
        list[str]: Unique matching paths in ascending order.
    """
    matches = set(glob.glob('*.json'))
    matches.update(glob.glob('eval_results*.json'))
    return sorted(matches)
def load_eval(path):
    """Parse the JSON document stored at ``path`` and return it.

    The file is opened explicitly as UTF-8 so that non-ASCII text decodes
    the same way regardless of the platform's default locale encoding.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)
def friendly_name(path):
    """Derive a short display name for an eval file.

    Takes the file's base name, removes every occurrence of
    ``'eval_results'`` and ``'.json'``, then trims leading/trailing
    underscores, hyphens and spaces. If nothing is left, the untouched
    base name is returned instead.
    """
    base = os.path.basename(path)
    trimmed = base.replace('eval_results', '').replace('.json', '').strip('_- ')
    return trimmed or base
# Discover the eval files once at import time ...
eval_files = get_eval_files()
# ... and load each one, keyed by its display name. Two files that map to
# the same display name collapse to one entry (the later file wins).
eval_data = {
    friendly_name(eval_path): load_eval(eval_path)
    for eval_path in eval_files
}
# ── TAB 1: metrics dashboard ──────────────────────────────────────────────
def make_bar_chart(selected_runs, metric):
    """Build a grouped bar chart of one metric across fields, one trace per run.

    Args:
        selected_runs: Iterable of run names (keys of ``eval_data``). Names
            that are not loaded are skipped.
        metric: Metric key looked up in each field's summary; a field that
            lacks it is plotted as 0.

    Returns:
        A ``go.Figure``. When ``selected_runs`` is empty the figure is
        returned bare, without traces or layout.
    """
    fig = go.Figure()
    if not selected_runs:
        return fig

    for slot, run in enumerate(selected_runs):
        if run not in eval_data:
            continue
        summary = eval_data[run]['summary']
        scores = [stats.get(metric, 0) for stats in summary.values()]
        tick_labels = [FIELD_LABELS.get(name, name) for name in summary]
        # The colour is tied to the run's position in the selection (skipped
        # runs still consume a slot), so colours stay stable per position.
        bar = go.Bar(
            name=run,
            x=tick_labels,
            y=scores,
            marker_color=COLORS[slot % len(COLORS)],
            text=[f'{score:.1%}' for score in scores],
            textposition='outside',
        )
        fig.add_trace(bar)

    # The y range extends past 1.0 to leave room for the outside bar labels.
    y_axis = dict(range=[0, 1.15], tickformat='.0%', title='Score', gridcolor='#eee')
    fig.update_layout(
        barmode='group',
        yaxis=y_axis,
        xaxis_title='Field',
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Georgia, serif', size=13),
        legend=dict(orientation='h', y=1.12),
        margin=dict(t=60, b=40, l=40, r=20),
        height=420,
    )
    return fig
def make_radar(selected_runs):
    """Build a radar chart of field-averaged metrics, one polygon per run.

    For each of the four plotted metrics, the value is the mean over the
    ``IMPUTE_FIELDS`` entries whose summary contains that metric, or 0 when
    no field has it.

    Args:
        selected_runs: Iterable of run names (keys of ``eval_data``). Names
            that are not loaded are skipped.

    Returns:
        A ``go.Figure``. When ``selected_runs`` is empty the figure is
        returned bare, without traces or layout.
    """
    if not selected_runs:
        return go.Figure()

    # (axis label, metric key) pairs, in plotting order.
    axes = [
        ('Exact Match', 'exact_match'),
        ('Fuzzy Match', 'fuzzy_token_sort'),
        ('Semantic Sim', 'semantic_sim'),
        ('Top-3 Match', 'top3_match'),
    ]
    axis_names = [label for label, _ in axes]

    fig = go.Figure()
    for slot, run in enumerate(selected_runs):
        if run not in eval_data:
            continue
        summary = eval_data[run]['summary']
        averages = []
        for _, key in axes:
            present = [
                summary[name].get(key, 0)
                for name in IMPUTE_FIELDS
                if key in summary.get(name, {})
            ]
            averages.append(np.mean(present) if present else 0)
        colour = COLORS[slot % len(COLORS)]
        # Repeat the first point at the end so the polygon closes.
        fig.add_trace(go.Scatterpolar(
            r=averages + [averages[0]],
            theta=axis_names + [axis_names[0]],
            name=run,
            line_color=colour,
            fill='toself',
            fillcolor=colour,
            opacity=0.2,
        ))

    fig.update_layout(
        polar=dict(radialaxis=dict(range=[0, 1], tickformat='.0%')),
        font=dict(family='Georgia, serif', size=12),
        height=380,
        margin=dict(t=40, b=40),
        paper_bgcolor='white',
    )
    return fig
def make_summary_table(selected_runs):
    """Tabulate every metric for every field of the selected runs.

    Args:
        selected_runs: Iterable of run names (keys of ``eval_data``). Names
            that are not loaded are skipped.

    Returns:
        A ``pd.DataFrame`` with one row per (run, field): ``Run``, ``Field``
        and one column per ``METRICS`` label. Present metrics are formatted
        as one-decimal percentages; missing ones show an em dash. An empty
        frame is returned when nothing is selected.
    """
    if not selected_runs:
        return pd.DataFrame()

    table_rows = []
    for run in selected_runs:
        if run not in eval_data:
            continue
        for field, stats in eval_data[run]['summary'].items():
            record = {'Run': run, 'Field': FIELD_LABELS.get(field, field)}
            for key, heading in METRICS.items():
                if key in stats:
                    record[heading] = f"{stats.get(key, 0):.1%}"
                else:
                    record[heading] = '—'
            table_rows.append(record)
    return pd.DataFrame(table_rows)
# ── TAB 2: artifact deep dive ─────────────────────────────────────────────
def make_confusion(run, field):
    """Build a ground-truth vs. predicted confusion heatmap for one run/field.

    Ground-truth and predicted values are truncated to 35 characters before
    counting, so values sharing a 35-character prefix fall into the same
    cell. The axis labels are the sorted union of both truncated sets.

    Args:
        run: Run name (key of ``eval_data``).
        field: Field key inside the run's ``'results'`` mapping.

    Returns:
        A ``go.Figure``; a bare figure when the run is unknown or the field
        has no results.
    """
    if not (run and run in eval_data):
        return go.Figure()
    records = eval_data[run].get('results', {}).get(field, [])
    if not records:
        return go.Figure()

    truths = [rec['gt'][:35] for rec in records]
    guesses = [str(rec['pred'])[:35] for rec in records]

    labels = sorted(set(truths).union(guesses))
    size = len(labels)
    position = {label: at for at, label in enumerate(labels)}

    # Rows are ground truth, columns are predictions. Every value is in
    # ``position`` by construction, since labels is the union of both lists.
    counts = np.zeros((size, size), dtype=int)
    for truth, guess in zip(truths, guesses):
        counts[position[truth]][position[guess]] += 1

    heatmap = go.Heatmap(
        z=counts,
        x=labels,
        y=labels,
        colorscale='YlOrRd',
        text=counts,
        texttemplate='%{text}',
    )
    fig = go.Figure(heatmap)
    fig.update_layout(
        xaxis_title='Predicted',
        yaxis_title='Ground Truth',
        # Grow the plot with the number of labels, but never below 380px.
        height=max(380, size * 28),
        font=dict(family='Georgia, serif', size=11),
        margin=dict(t=20, b=80, l=120, r=20),
        paper_bgcolor='white',
    )
    return fig
def make_scatter(run, field):
    """Scatter fuzzy-match score against semantic similarity for one run/field.

    Each result is one marker: x is ``fuzzy_token_sort``, y is
    ``semantic_sim`` (both default to 0 when absent). Markers are green for
    exact matches and orange-red otherwise. A dotted diagonal from (0, 0)
    to (1, 1) is drawn for reference.

    Args:
        run: Run name (key of ``eval_data``).
        field: Field key inside the run's ``'results'`` mapping.

    Returns:
        A ``go.Figure``; a bare figure when the run is unknown or the field
        has no results.
    """
    if not run or run not in eval_data:
        return go.Figure()
    results = eval_data[run].get('results', {}).get(field, [])
    if not results:
        return go.Figure()

    x = [r.get('fuzzy_token_sort', 0) for r in results]
    y = [r.get('semantic_sim', 0) for r in results]
    em = [r.get('exact_match', False) for r in results]
    # Plotly hover text uses <br> for line breaks. The original literal was
    # split across physical lines, which is a syntax error in a
    # single-quoted f-string.
    hover = [
        f"{r['label']}<br>GT: {r['gt'][:50]}<br>PRED: {str(r['pred'])[:50]}"
        for r in results
    ]
    colors_pt = ['#2d6a4f' if e else '#e76f51' for e in em]

    fig = go.Figure(go.Scatter(
        x=x, y=y, mode='markers',
        marker=dict(color=colors_pt, size=9, opacity=0.75,
                    line=dict(width=0.5, color='white')),
        text=hover, hoverinfo='text',
    ))
    fig.add_shape(type='line', x0=0, y0=0, x1=1, y1=1,
                  line=dict(dash='dot', color='#aaa', width=1))
    fig.update_layout(
        xaxis=dict(title='Fuzzy match', range=[0, 1.05], gridcolor='#eee'),
        yaxis=dict(title='Semantic similarity', range=[0, 1.05], gridcolor='#eee'),
        height=360,
        plot_bgcolor='white', paper_bgcolor='white',
        font=dict(family='Georgia, serif', size=12),
        margin=dict(t=20, b=40),
    )
    return fig
def make_error_table(run, field):
    """List the results of one run/field that are not exact matches.

    Args:
        run: Run name (key of ``eval_data``).
        field: Field key inside the run's ``'results'`` mapping.

    Returns:
        A ``pd.DataFrame`` with one row per non-exact-match result. Project,
        ground truth and prediction are truncated for display and the two
        similarity scores are formatted to two decimals. An empty frame is
        returned when the run is unknown.
    """
    if not (run and run in eval_data):
        return pd.DataFrame()

    records = eval_data[run].get('results', {}).get(field, [])
    table_rows = [
        {
            'Label': rec['label'],
            'Class': rec.get('item_class', ''),
            'Project': rec.get('project', '')[:40],
            'GT': rec['gt'][:60],
            'Predicted': str(rec['pred'])[:60],
            'Sem Sim': f"{rec.get('semantic_sim', 0):.2f}",
            'Fuzzy': f"{rec.get('fuzzy_token_sort', 0):.2f}",
        }
        for rec in records
        if not rec.get('exact_match', False)
    ]
    return pd.DataFrame(table_rows)
# ── TAB 3: per-artifact browser ───────────────────────────────────────────
def get_all_artifacts(run, field, only_errors):
    """Return dropdown labels and the matching result records for a run/field.

    Args:
        run: Run name (key of ``eval_data``).
        field: Field key inside the run's ``'results'`` mapping.
        only_errors: When truthy, keep only results that are not exact
            matches.

    Returns:
        A ``(choices, results)`` pair of parallel lists. Each choice is
        ``"<label> | <item_class> | <project, first 30 chars>"``. Both lists
        are empty when the run is unknown.
    """
    if not (run and run in eval_data):
        return [], []

    records = eval_data[run].get('results', {}).get(field, [])
    if only_errors:
        records = [rec for rec in records if not rec.get('exact_match', False)]

    labels = []
    for rec in records:
        labels.append(
            f"{rec['label']} | {rec.get('item_class','')} | {rec.get('project','')[:30]}"
        )
    return labels, records
# Module-level scratch state shared by the artifact-browser callbacks:
# search_artifacts() stores the current 'choices' and 'results' lists here,
# and show_artifact_card() reads them back to resolve a dropdown selection.
# NOTE(review): as a module global, this dict is shared by every caller of
# those functions — confirm that is acceptable if several users browse at once.
_artifact_cache = {}
def search_artifacts(run, field, only_errors, query):
    """Refresh the artifact dropdown for a run/field, optionally filtered.

    The candidate list comes from ``get_all_artifacts``. When ``query`` is
    non-empty, only entries whose choice label or ground-truth text contains
    the query (case-insensitive) are kept. The resulting parallel
    ``'choices'`` / ``'results'`` lists are stored in ``_artifact_cache``.

    Args:
        run: Run name (key of ``eval_data``).
        field: Field key inside the run's ``'results'`` mapping.
        only_errors: When truthy, restrict to non-exact-match results.
        query: Search text; falsy means no filtering.

    Returns:
        A ``gr.Dropdown`` populated with the choices, preselecting the first
        one (or ``None`` when there are no choices).
    """
    choices, results = get_all_artifacts(run, field, only_errors)
    _artifact_cache.update(results=results, choices=choices)

    if query:
        kept = [
            (choice, rec)
            for choice, rec in zip(choices, results)
            if query.lower() in choice.lower() or query.lower() in rec['gt'].lower()
        ]
        choices = [choice for choice, _ in kept]
        _artifact_cache.update(results=[rec for _, rec in kept], choices=choices)

    first = choices[0] if choices else None
    return gr.Dropdown(choices=choices, value=first)
def show_artifact_card(selection):
if not selection or 'results' not in _artifact_cache:
return '
Select an artifact above
' choices = _artifact_cache['choices'] results = _artifact_cache['results'] if selection not in choices: return 'Not found
' r = results[choices.index(selection)] em = r.get('exact_match', False) fuzz = r.get('fuzzy_token_sort', 0) sem = r.get('semantic_sim', 0) top3 = r.get('top3', []) bleu = r.get('bleu', None) gt = r['gt'] pred = str(r['pred']) field = list(eval_data[list(eval_data.keys())[0]]['results'].keys())[0] status_color = '#2d6a4f' if em else '#e76f51' status_text = 'Exact match' if em else 'No exact match' top3_html = '' if top3: top3_html = '{r.get("item_class","")} · {r.get("project","")}
| Field | Value |
|---|---|
| Ground Truth | {gt} |
| Predicted | {pred} |
Search for artifacts above
') ab_search.click( search_artifacts, inputs=[ab_run, ab_field, ab_errors, ab_query], outputs=[ab_select], ) ab_select.change(show_artifact_card, inputs=[ab_select], outputs=[ab_card]) # ── TAB 4: about ─────────────────────────────────────────────────── with gr.Tab('About'): gr.Markdown(""" ## Pipeline Architecture **Encoding:** Each v4 artifact is encoded as a 1408-dim vector by concatenating: - DINOv2 ViT-L/14 image embedding (1024-dim) from the artifact's photograph - all-MiniLM-L6-v2 text embedding (384-dim) from concatenated metadata fields **Index:** FAISS flat index (IndexFlatIP) built on the 85% train split of v4 (19,215 artifacts). The remaining 15% (3,392 artifacts) are held out as the eval set. **Retrieval:** For each eval artifact, the top-N most similar artifacts are retrieved from the index, filtered to only those that have the target field populated. **Generation:** GPT-4o receives the artifact image + available fields + up to N retrieved neighbors as structured JSON context, plus a constrained vocabulary derived from the train split. 
## Eval Setup - 85/15 stratified split of v4 by `(project_label, item_class_label)` - 100 artifacts sampled per field from the eval split - Each field evaluated independently — the target field is blanked and predicted - **Runs compared:** top-15 neighbors vs top-50 neighbors passed to GPT-4o ## Metrics | Metric | Description | |---|---| | Exact Match | Strict case-insensitive string equality | | Fuzzy Match | Token sort ratio (handles word order variation) | | Semantic Similarity | Cosine similarity of sentence embeddings | | Top-3 Match | Ground truth appears in model's top-3 candidates | | BLEU | N-gram overlap — description field only | Urmi Dedhia · CMU · April 2026 · ArchAIa Project """) # load defaults on start demo.load( lambda: update_overview(run_names, 'exact_match'), outputs=[bar_chart, radar_chart, summary_table] ) demo.load( lambda: update_deepdive(run_names[0], 'recovered_material'), outputs=[scatter, conf_m, error_table] ) if __name__ == '__main__': demo.launch() EOF