import json, os, glob import gradio as gr import plotly.graph_objects as go import plotly.express as px import numpy as np import pandas as pd # ── constants ───────────────────────────────────────────────────────────── IMPUTE_FIELDS = [ 'recovered_material', 'recovered_object_type', 'recovered_condition', 'recovered_period', 'recovered_description' ] FIELD_LABELS = { 'recovered_material': 'Material', 'recovered_object_type': 'Object Type', 'recovered_condition': 'Condition', 'recovered_period': 'Period', 'recovered_description': 'Description', } METRICS = { 'exact_match': 'Exact Match', 'fuzzy_token_sort': 'Fuzzy Match', 'semantic_sim': 'Semantic Similarity', 'top3_match': 'Top-3 Match', 'bleu': 'BLEU (description only)', } COLORS = ['#7D3A10', '#2d6a4f', '#1848A0', '#e9c46a', '#993556'] # ── load all eval jsons ─────────────────────────────────────────────────── def get_eval_files(): return sorted(glob.glob('*.json') + glob.glob('eval_results*.json')) def load_eval(path): with open(path) as f: return json.load(f) def friendly_name(path): n = os.path.basename(path).replace('eval_results','').replace('.json','').strip('_- ') return n if n else os.path.basename(path) eval_files = get_eval_files() eval_data = {friendly_name(f): load_eval(f) for f in eval_files} # ── TAB 1: metrics dashboard ────────────────────────────────────────────── def make_bar_chart(selected_runs, metric): if not selected_runs: return go.Figure() fig = go.Figure() for i, run in enumerate(selected_runs): if run not in eval_data: continue data = eval_data[run]['summary'] fields = list(data.keys()) vals = [data[f].get(metric, 0) for f in fields] labels = [FIELD_LABELS.get(f, f) for f in fields] fig.add_trace(go.Bar( name=run, x=labels, y=vals, marker_color=COLORS[i % len(COLORS)], text=[f'{v:.1%}' for v in vals], textposition='outside', )) fig.update_layout( barmode='group', yaxis=dict(range=[0,1.15], tickformat='.0%', title='Score', gridcolor='#eee'), xaxis_title='Field', plot_bgcolor='white', paper_bgcolor='white', font=dict(family='Georgia, serif', size=13), legend=dict(orientation='h', y=1.12), margin=dict(t=60, b=40, l=40, r=20), height=420, ) return fig def make_radar(selected_runs): if not selected_runs: return go.Figure() cats = ['Exact Match','Fuzzy Match','Semantic Sim','Top-3 Match'] metric_keys = ['exact_match','fuzzy_token_sort','semantic_sim','top3_match'] fig = go.Figure() for i, run in enumerate(selected_runs): if run not in eval_data: continue data = eval_data[run]['summary'] vals = [] for mk in metric_keys: field_vals = [data[f].get(mk, 0) for f in IMPUTE_FIELDS if mk in data.get(f,{})] vals.append(np.mean(field_vals) if field_vals else 0) fig.add_trace(go.Scatterpolar( r=vals + [vals[0]], theta=cats + [cats[0]], name=run, line_color=COLORS[i % len(COLORS)], fill='toself', fillcolor=COLORS[i % len(COLORS)], opacity=0.2, )) fig.update_layout( polar=dict(radialaxis=dict(range=[0,1], tickformat='.0%')), font=dict(family='Georgia, serif', size=12), height=380, margin=dict(t=40, b=40), paper_bgcolor='white', ) return fig def make_summary_table(selected_runs): if not selected_runs: return pd.DataFrame() rows = [] for run in selected_runs: if run not in eval_data: continue summary = eval_data[run]['summary'] for field, stats in summary.items(): row = {'Run': run, 'Field': FIELD_LABELS.get(field, field)} for mk, ml in METRICS.items(): row[ml] = f"{stats.get(mk, 0):.1%}" if mk in stats else '—' rows.append(row) return pd.DataFrame(rows) # ── TAB 2: artifact deep dive ───────────────────────────────────────────── def make_confusion(run, field): if not run or run not in eval_data: return go.Figure() results = eval_data[run].get('results', {}).get(field, []) if not results: return go.Figure() gts = [r['gt'][:35] for r in results] preds = [str(r['pred'])[:35] for r in results] labels = sorted(set(gts) | set(preds)) n = len(labels) idx = {l: i for i, l in enumerate(labels)} mat = np.zeros((n,n), dtype=int) for g, p in zip(gts, preds): if g in idx and p in idx: mat[idx[g]][idx[p]] += 1 fig = go.Figure(go.Heatmap( z=mat, x=labels, y=labels, colorscale='YlOrRd', text=mat, texttemplate='%{text}', )) fig.update_layout( xaxis_title='Predicted', yaxis_title='Ground Truth', height=max(380, n*28), font=dict(family='Georgia, serif', size=11), margin=dict(t=20, b=80, l=120, r=20), paper_bgcolor='white', ) return fig def make_scatter(run, field): if not run or run not in eval_data: return go.Figure() results = eval_data[run].get('results', {}).get(field, []) if not results: return go.Figure() x = [r.get('fuzzy_token_sort', 0) for r in results] y = [r.get('semantic_sim', 0) for r in results] em = [r.get('exact_match', False) for r in results] hover = [f"{r['label']}
GT: {r['gt'][:50]}
PRED: {str(r['pred'])[:50]}" for r in results] colors_pt = ['#2d6a4f' if e else '#e76f51' for e in em] fig = go.Figure(go.Scatter( x=x, y=y, mode='markers', marker=dict(color=colors_pt, size=9, opacity=0.75, line=dict(width=0.5, color='white')), text=hover, hoverinfo='text', )) fig.add_shape(type='line', x0=0,y0=0,x1=1,y1=1, line=dict(dash='dot', color='#aaa', width=1)) fig.update_layout( xaxis=dict(title='Fuzzy match', range=[0,1.05], gridcolor='#eee'), yaxis=dict(title='Semantic similarity', range=[0,1.05], gridcolor='#eee'), height=360, plot_bgcolor='white', paper_bgcolor='white', font=dict(family='Georgia, serif', size=12), margin=dict(t=20, b=40), ) return fig def make_error_table(run, field): if not run or run not in eval_data: return pd.DataFrame() results = eval_data[run].get('results', {}).get(field, []) errors = [r for r in results if not r.get('exact_match', False)] rows = [] for r in errors: rows.append({ 'Label': r['label'], 'Class': r.get('item_class',''), 'Project': r.get('project','')[:40], 'GT': r['gt'][:60], 'Predicted':str(r['pred'])[:60], 'Sem Sim': f"{r.get('semantic_sim',0):.2f}", 'Fuzzy': f"{r.get('fuzzy_token_sort',0):.2f}", }) return pd.DataFrame(rows) # ── TAB 3: per-artifact browser ─────────────────────────────────────────── def get_all_artifacts(run, field, only_errors): if not run or run not in eval_data: return [], [] results = eval_data[run].get('results', {}).get(field, []) if only_errors: results = [r for r in results if not r.get('exact_match', False)] choices = [f"{r['label']} | {r.get('item_class','')} | {r.get('project','')[:30]}" for r in results] return choices, results _artifact_cache = {} def search_artifacts(run, field, only_errors, query): choices, results = get_all_artifacts(run, field, only_errors) _artifact_cache['results'] = results _artifact_cache['choices'] = choices if query: filtered = [(c, r) for c, r in zip(choices, results) if query.lower() in c.lower() or query.lower() in r['gt'].lower()] choices = [x[0] for x in filtered] _artifact_cache['results'] = [x[1] for x in filtered] _artifact_cache['choices'] = choices return gr.Dropdown(choices=choices, value=choices[0] if choices else None) def show_artifact_card(selection): if not selection or 'results' not in _artifact_cache: return '

Select an artifact above

' choices = _artifact_cache['choices'] results = _artifact_cache['results'] if selection not in choices: return '

Not found

' r = results[choices.index(selection)] em = r.get('exact_match', False) fuzz = r.get('fuzzy_token_sort', 0) sem = r.get('semantic_sim', 0) top3 = r.get('top3', []) bleu = r.get('bleu', None) gt = r['gt'] pred = str(r['pred']) field = list(eval_data[list(eval_data.keys())[0]]['results'].keys())[0] status_color = '#2d6a4f' if em else '#e76f51' status_text = 'Exact match' if em else 'No exact match' top3_html = '' if top3: top3_html = '
Top-3 candidates: ' + \ ' · '.join(f'{c}' for c in top3) + '
' bleu_html = f'BLEU: {bleu:.3f}' if bleu is not None else '' html = f'''

{r["label"]}

{r.get("item_class","")} · {r.get("project","")}

{status_text}
Field Value
Ground Truth {gt}
Predicted {pred}
{top3_html}
Scores: Fuzzy match: {fuzz:.2f} Semantic similarity: {sem:.2f} {bleu_html}
''' return html # ── build app ───────────────────────────────────────────────────────────── run_names = list(eval_data.keys()) field_choices = [(FIELD_LABELS[f], f) for f in IMPUTE_FIELDS] css = ''' .gr-button-primary { background: #7D3A10 !important; } h1, h2, h3 { font-family: Georgia, serif !important; } ''' with gr.Blocks( title='ArchAIa Imputation Eval', theme=gr.themes.Base(font=[gr.themes.GoogleFont('Source Serif 4'), 'Georgia', 'serif']), css=css, ) as demo: gr.Markdown(""" # ArchAIa — Field Imputation Evaluation Dashboard **CMU Language Technologies Institute · April 2026** Evaluation of a multimodal RAG pipeline (DINOv2 + MiniLM + GPT-4o) for filling missing metadata fields in archaeological artifacts from the OpenContext database. Compare results across different retrieval settings (top-15 vs top-50 neighbors). """) with gr.Tabs(): # ── TAB 1: metrics overview ──────────────────────────────────────── with gr.Tab('Metrics Overview'): with gr.Row(): run_selector = gr.CheckboxGroup( choices=run_names, value=run_names, label='Select eval runs to compare', ) metric_radio = gr.Radio( choices=list(METRICS.items()), value='exact_match', label='Metric', ) bar_chart = gr.Plot(label='Per-field scores by run') with gr.Row(): radar_chart = gr.Plot(label='Overall radar (mean across fields)') with gr.Column(): gr.Markdown("### Summary table") summary_table = gr.Dataframe(label='', wrap=True) def update_overview(runs, metric): return ( make_bar_chart(runs, metric), make_radar(runs), make_summary_table(runs), ) run_selector.change(update_overview, [run_selector, metric_radio], [bar_chart, radar_chart, summary_table]) metric_radio.change(update_overview, [run_selector, metric_radio], [bar_chart, radar_chart, summary_table]) # ── TAB 2: field deep dive ───────────────────────────────────────── with gr.Tab('Field Deep Dive'): gr.Markdown("Inspect per-artifact predictions for a specific field and run.") with gr.Row(): dd_run = gr.Dropdown(choices=run_names, value=run_names[0], label='Eval run') dd_field = gr.Dropdown(choices=field_choices, value='recovered_material', label='Field') with gr.Row(): scatter = gr.Plot(label='Fuzzy match vs Semantic similarity (green = exact match)') conf_m = gr.Plot(label='Confusion matrix') gr.Markdown("### Errors only") error_table = gr.Dataframe(label='Artifacts where exact match failed', wrap=True) def update_deepdive(run, field): return ( make_scatter(run, field), make_confusion(run, field), make_error_table(run, field), ) dd_run.change(update_deepdive, [dd_run, dd_field], [scatter, conf_m, error_table]) dd_field.change(update_deepdive, [dd_run, dd_field], [scatter, conf_m, error_table]) # ── TAB 3: artifact browser ──────────────────────────────────────── with gr.Tab('Artifact Browser'): gr.Markdown("Browse individual artifact predictions. Filter by run, field, and correct/incorrect.") with gr.Row(): ab_run = gr.Dropdown(choices=run_names, value=run_names[0], label='Eval run') ab_field = gr.Dropdown(choices=field_choices, value='recovered_material', label='Field') ab_errors = gr.Checkbox(value=False, label='Show errors only') ab_query = gr.Textbox(label='Search by label or ground truth', placeholder='e.g. Batch 5') ab_select = gr.Dropdown(label='Select artifact', choices=[], interactive=True) ab_search = gr.Button('Search / Refresh', variant='primary') ab_card = gr.HTML('

Search for artifacts above

') ab_search.click( search_artifacts, inputs=[ab_run, ab_field, ab_errors, ab_query], outputs=[ab_select], ) ab_select.change(show_artifact_card, inputs=[ab_select], outputs=[ab_card]) # ── TAB 4: about ─────────────────────────────────────────────────── with gr.Tab('About'): gr.Markdown(""" ## Pipeline Architecture **Encoding:** Each v4 artifact is encoded as a 1408-dim vector by concatenating: - DINOv2 ViT-L/14 image embedding (1024-dim) from the artifact's photograph - all-MiniLM-L6-v2 text embedding (384-dim) from concatenated metadata fields **Index:** FAISS flat index (IndexFlatIP) built on the 85% train split of v4 (19,215 artifacts). The remaining 15% (3,392 artifacts) are held out as the eval set. **Retrieval:** For each eval artifact, the top-N most similar artifacts are retrieved from the index, filtered to only those that have the target field populated. **Generation:** GPT-4o receives the artifact image + available fields + up to N retrieved neighbors as structured JSON context, plus a constrained vocabulary derived from the train split. ## Eval Setup - 85/15 stratified split of v4 by `(project_label, item_class_label)` - 100 artifacts sampled per field from the eval split - Each field evaluated independently — the target field is blanked and predicted - **Runs compared:** top-15 neighbors vs top-50 neighbors passed to GPT-4o ## Metrics | Metric | Description | |---|---| | Exact Match | Strict case-insensitive string equality | | Fuzzy Match | Token sort ratio (handles word order variation) | | Semantic Similarity | Cosine similarity of sentence embeddings | | Top-3 Match | Ground truth appears in model's top-3 candidates | | BLEU | N-gram overlap — description field only | Urmi Dedhia · CMU · April 2026 · ArchAIa Project """) # load defaults on start demo.load( lambda: update_overview(run_names, 'exact_match'), outputs=[bar_chart, radar_chart, summary_table] ) demo.load( lambda: update_deepdive(run_names[0], 'recovered_material'), outputs=[scatter, conf_m, error_table] ) if __name__ == '__main__': demo.launch() EOF