Spaces:
Running
Running
Upload app.py with huggingface_hub
Browse files
app.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GF-Score: Fairness-Aware Robustness Auditing Dashboard
|
| 3 |
+
=======================================================
|
| 4 |
+
Hugging Face Spaces entry point.
|
| 5 |
+
|
| 6 |
+
Loads pre-computed evaluation results (no model inference required)
|
| 7 |
+
and serves an interactive Gradio dashboard.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import sys
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
|
| 16 |
+
# Ensure repo root is on the path so gf_score package is importable
|
| 17 |
+
ROOT = Path(__file__).parent.resolve()
|
| 18 |
+
sys.path.insert(0, str(ROOT))
|
| 19 |
+
|
| 20 |
+
import gradio as gr
|
| 21 |
+
|
| 22 |
+
logging.basicConfig(level=logging.INFO)
|
| 23 |
+
logger = logging.getLogger("gf_score.hf_app")
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# Paths — resolved relative to repo root (works both locally and on HF)
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
# Pre-computed evaluation artefacts are read from <repo root>/outputs/results.
RESULTS_DIR = ROOT.joinpath("outputs", "results")
# Report sub-directory; created (including parents) as soon as the module loads.
REPORTS_DIR = RESULTS_DIR.joinpath("reports")
REPORTS_DIR.mkdir(exist_ok=True, parents=True)
|
| 31 |
+
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
# Model short-name mappings (copied from config to keep this file standalone)
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# Full CIFAR-10 model identifier -> short label shown in the model dropdown.
CIFAR10_SHORT_NAMES = {
    "Augustin2020Adversarial_34_10_extra": "Augustin_WRN_extra",
    "Augustin2020Adversarial_34_10": "Augustin_WRN",
    "Augustin2020Adversarial": "Augustin2020",
    "Ding2020MMA": "Ding_MMA",
    "Engstrom2019Robustness": "Engstrom2019",
    "Gowal2020Uncovering": "Gowal2020",
    "Gowal2020Uncovering_extra": "Gowal_extra",
    "Rade2021Helper_R18_ddpm": "Rade_R18",
    "Rebuffi2021Fixing_28_10_cutmix_ddpm": "Rebuffi_28_ddpm",
    "Rebuffi2021Fixing_70_16_cutmix_ddpm": "Rebuffi_70_ddpm",
    "Rebuffi2021Fixing_70_16_cutmix_extra": "Rebuffi_extra",
    "Rebuffi2021Fixing_R18_cutmix_ddpm": "Rebuffi_R18",
    "Rice2020Overfitting": "Rice2020",
    "Rony2019Decoupling": "Rony2019",
    "Sehwag2021Proxy": "Sehwag_Proxy",
    "Sehwag2021Proxy_R18": "Sehwag_R18",
    "Wu2020Adversarial": "Wu2020",
}

# Full ImageNet model identifier -> short label shown in the model dropdown.
IMAGENET_SHORT_NAMES = {
    "Salman2020Do_50_2": "Salman_WRN50-2",
    "Salman2020Do_R50": "Salman_R50",
    "Engstrom2019Robustness": "Engstrom2019",
    "Wong2020Fast": "Wong2020",
    "Salman2020Do_R18": "Salman_R18",
}

# Dataset key -> the short-name table that applies to that dataset.
SHORT_NAMES = {
    "cifar10": CIFAR10_SHORT_NAMES,
    "imagenet": IMAGENET_SHORT_NAMES,
}
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Data loading
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def load_results(dataset: str):
    """Load the pre-computed evaluation results for *dataset*.

    CIFAR-10 results are read from ``full_results.json``; any other dataset
    key is read from ``full_results_<dataset>.json``. Both live in
    ``RESULTS_DIR``.

    Args:
        dataset: Dataset key, e.g. ``"cifar10"`` or ``"imagenet"``.

    Returns:
        The parsed JSON content, or ``None`` (after logging a warning) when
        the results file does not exist.
    """
    suffix = "" if dataset == "cifar10" else f"_{dataset}"
    path = RESULTS_DIR / f"full_results{suffix}.json"
    if not path.exists():
        logger.warning("Results file not found: %s", path)
        return None
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # platform's default locale encoding.
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_available_datasets():
    """Return the dataset keys whose results file exists in ``RESULTS_DIR``.

    The order is always ``"cifar10"`` before ``"imagenet"``. When neither file
    is present, ``["cifar10"]`` is returned as a fallback so callers always
    get a non-empty list.
    """
    candidates = (
        ("cifar10", "full_results.json"),
        ("imagenet", "full_results_imagenet.json"),
    )
    found = [
        key for key, filename in candidates
        if (RESULTS_DIR / filename).exists()
    ]
    if not found:
        return ["cifar10"]
    return found
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def get_model_choices(results, dataset: str):
    """List the display names of all models contained in *results*.

    Each key of ``results["model_results"]`` is translated through the
    short-name table for *dataset*; keys without a short name are returned
    unchanged. Returns an empty list when *results* is ``None``.
    """
    if results is None:
        return []
    short_for = SHORT_NAMES.get(dataset, {})
    labels = []
    for full_name in results["model_results"]:
        labels.append(short_for.get(full_name, full_name))
    return labels
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def display_name_to_full(display_name: str, results, dataset: str):
    """Resolve a dropdown label back to the key used in ``results["model_results"]``.

    The short-name table for *dataset* is searched first; the first full name
    whose short label equals *display_name* and which is present in the
    results wins. Otherwise *display_name* itself is returned if it is a
    results key, else ``None``.
    """
    short_names = SHORT_NAMES.get(dataset, {})
    resolved = next(
        (
            full
            for full, short in short_names.items()
            if short == display_name and full in results["model_results"]
        ),
        None,
    )
    if resolved is not None:
        return resolved
    if display_name in results["model_results"]:
        return display_name
    return None
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def get_class_names(results):
    """Return the class names described by *results*.

    Preference order:
      1. a non-empty ``results["metadata"]["class_names"]`` value;
      2. the ``per_class_scores`` keys of the first model entry;
      3. an empty list (also used when *results* is ``None``).
    """
    if results is None:
        return []

    declared = results.get("metadata", {}).get("class_names")
    if declared:
        return declared

    per_model = results.get("model_results", {})
    if not per_model:
        return []

    # Fall back to whatever classes the first model entry reports scores for.
    first_entry = next(iter(per_model.values()))
    return list(first_entry.get("per_class_scores", {}))
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ---------------------------------------------------------------------------
|
| 118 |
+
# Analysis
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
+
|
| 121 |
+
def _class_table_md(classes, per_class, per_acc, max_score):
    """Render a Markdown table (header plus one row per class) of robustness scores."""
    # Clamp the denominator so bar scaling never divides by zero.
    scale = max(max_score, 0.001)
    parts = ["| Class | Score | Accuracy | Bar |\n|-------|------:|----------:|-----|\n"]
    for cls in classes:
        score = per_class.get(cls, 0)
        acc = per_acc.get(cls, 0)
        bar = "█" * int(score / scale * 15)
        parts.append(f"| `{cls}` | {score:.4f} | {acc:.1%} | {bar} |\n")
    return "".join(parts)


def analyze_model(model_display_name, lambda_val, dataset, results):
    """Build the Markdown analysis and the HTML audit report for one model.

    Args:
        model_display_name: Model label as selected in the dropdown.
        lambda_val: Fairness-penalty weight λ; FP-GREAT is computed as
            ``aggregate_great_score - lambda_val * rdi``.
        dataset: Dataset key. ``"cifar10"`` selects the CIFAR-10 labels; any
            other value selects the ImageNet labels.
        results: Parsed results dictionary, or ``None`` when no results are
            available.

    Returns:
        A ``(markdown, html)`` tuple of strings. When *results* is ``None`` or
        the model cannot be resolved, both elements are short placeholder
        messages.

    Raises:
        KeyError: If the model entry lacks ``aggregate_great_score``, ``rdi``,
            ``nrgc`` or ``wcr``.
    """
    if results is None:
        return (
            "⚠️ **No results found.** The pre-computed evaluation files are missing.\n\n"
            "Please ensure `outputs/results/full_results.json` (and `full_results_imagenet.json`) "
            "are committed to the Space repository.",
            "<p>No data available.</p>",
        )

    full_name = display_name_to_full(model_display_name, results, dataset)
    if full_name is None:
        return f"Model `{model_display_name}` not found in results.", "<p>Not found.</p>"

    r = results["model_results"][full_name]
    class_names = get_class_names(results)
    num_classes = len(class_names)
    # Above this class count only the 10 weakest and 10 strongest classes are shown.
    many_classes = num_classes > 30

    agg = r["aggregate_great_score"]
    rdi = r["rdi"]
    nrgc = r["nrgc"]
    wcr = r["wcr"]
    wcr_class = r.get("wcr_class", "—")
    fp_great = agg - lambda_val * rdi
    fp_at_0 = agg
    fp_at_1 = agg - rdi

    ds_label = "CIFAR-10" if dataset == "cifar10" else "ImageNet"
    threat = "L2 (ε=0.5)" if dataset == "cifar10" else "L∞ (ε=4/255)"

    rdi_icon = "✅ Low" if rdi < 0.1 else ("⚠️ Moderate" if rdi < 0.3 else "❌ High")
    wcr_icon = "✅ Good" if wcr > 0.2 else ("⚠️ Low" if wcr > 0.05 else "❌ Critical")
    fp_interp = (
        "No fairness penalty (= aggregate GREAT Score)" if lambda_val == 0.0 else
        "Mild fairness adjustment" if lambda_val < 0.3 else
        "Balanced robustness-fairness trade-off" if lambda_val < 0.7 else
        "Strong fairness emphasis"
    )

    # ---- Markdown analysis output ----
    # Collected as fragments and joined once at the end.
    md_parts = [f"""## 🛡️ {model_display_name}
**Dataset:** {ds_label} | **Threat Model:** {threat} | **Classes:** {num_classes}

---

### Aggregate Metrics

| Metric | Value | Status |
|--------|------:|--------|
| **GREAT Score** (Ω̂) | `{agg:.4f}` | Certified robustness lower bound |
| **RDI** (Disparity) | `{rdi:.4f}` | {rdi_icon} |
| **NRGC** (Gini) | `{nrgc:.4f}` | Class inequality index ∈ [0, 1) |
| **WCR** (Worst-Case)| `{wcr:.4f}` | {wcr_icon} — worst class: `{wcr_class}` |

---

### 🎛️ Fairness-Penalized Score (FP-GREAT)

**FP-GREAT = Ω̄ − λ × RDI = {agg:.4f} − {lambda_val:.2f} × {rdi:.4f} = `{fp_great:.4f}`**

*{fp_interp}*

| λ | FP-GREAT | Meaning |
|---|----------:|---------|
| 0.00 | {fp_at_0:.4f} | Pure robustness (no penalty) |
| **{lambda_val:.2f}** | **{fp_great:.4f}** | ← Current |
| 1.00 | {fp_at_1:.4f} | Max fairness penalty |

---

### Per-Class Robustness Scores
"""]

    per_class = r.get("per_class_scores", {})
    per_acc = r.get("per_class_accuracy", {})
    max_score = max(per_class.values()) if per_class else 1.0

    if many_classes:
        # Ascending by score: weakest classes first, strongest last.
        sorted_cls = sorted(per_class.keys(), key=lambda c: per_class.get(c, 0))
        bottom10, top10 = sorted_cls[:10], sorted_cls[-10:]

        md_parts.append(f"*{num_classes} total classes — showing bottom 10 and top 10:*\n\n")
        md_parts.append("**🔴 Bottom 10 — Most Vulnerable:**\n\n")
        md_parts.append(_class_table_md(bottom10, per_class, per_acc, max_score))
        md_parts.append("\n**🟢 Top 10 — Most Robust:**\n\n")
        md_parts.append(_class_table_md(top10, per_class, per_acc, max_score))
    else:
        md_parts.append(_class_table_md(class_names, per_class, per_acc, max_score))

    vuln = r.get("vulnerability_ranking", [])
    if vuln:
        display_vuln = vuln[:10] if many_classes else vuln
        suffix_txt = f" (top 10 of {num_classes})" if many_classes else ""
        # Icons are assigned against the FULL ranking length. Using the
        # truncated list would flag ranks 8-10 of the most vulnerable classes
        # as green whenever only the first 10 entries are displayed.
        total_ranked = len(vuln)
        md_parts.append(f"\n### Vulnerability Ranking{suffix_txt}\n")
        for rank, (cls, score) in enumerate(display_vuln, 1):
            if rank <= 3:
                icon = "🔴"
            elif rank <= total_ranked - 3:
                icon = "🟡"
            else:
                icon = "🟢"
            md_parts.append(f"{rank}. {icon} **`{cls}`**: {score:.4f}\n")

    md = "".join(md_parts)

    # ---- HTML audit report ----
    html_report = _build_html_report(
        model_display_name, r, ds_label, threat, num_classes,
        class_names, per_class, per_acc, max_score, vuln,
        agg, rdi, nrgc, wcr, wcr_class, fp_great, lambda_val,
    )

    return md, html_report
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def _build_html_report(
    model_name, r, ds_label, threat, num_classes,
    class_names, per_class, per_acc, max_score, vuln,
    agg, rdi, nrgc, wcr, wcr_class, fp_great, lambda_val,
):
    """Render a standalone HTML audit report for one model.

    Args:
        model_name: Model label shown in the title and header.
        r: The model's result entry (accepted for interface compatibility;
            not read here).
        ds_label: Human-readable dataset name.
        threat: Human-readable threat-model description.
        num_classes: Number of classes; above 30 only the 10 lowest- and 10
            highest-scoring classes and the first 10 ranking entries are shown.
        class_names: Classes listed in the per-class table when
            ``num_classes <= 30``.
        per_class: Mapping of class name to robustness score.
        per_acc: Mapping of class name to accuracy (formatted as a percentage).
        max_score: Score that corresponds to a full-width (200px) bar.
        vuln: Sequence of ``(class, score)`` pairs; the first entries are
            flagged as vulnerable.
        agg, rdi, nrgc, wcr: Aggregate metric values shown in the summary cards.
        wcr_class: Name of the worst-case class.
        fp_great: Fairness-penalised score shown in the summary cards.
        lambda_val: The λ used to compute ``fp_great``.

    Returns:
        The complete HTML document as a string.
    """
    # Imported locally so this function does not rely on additional
    # module-level imports.
    from datetime import timezone
    from html import escape

    def esc(value):
        # Names come from the results data; escape them so characters such as
        # "<" or "&" cannot break the generated markup.
        return escape(str(value))

    safe_model = esc(model_name)
    safe_wcr_class = esc(wcr_class)

    rdi_css = "pass" if rdi < 0.1 else ("warn" if rdi < 0.3 else "fail")
    wcr_css = "pass" if wcr > 0.2 else ("warn" if wcr > 0.05 else "fail")

    # Build per-class table rows
    if num_classes > 30:
        sorted_cls = sorted(per_class.keys(), key=lambda c: per_class.get(c, 0))
        display_cls = sorted_cls[:10] + sorted_cls[-10:]
    else:
        display_cls = class_names

    # Clamp the denominator so bar scaling never divides by zero.
    scale = max(max_score, 0.001)
    row_parts = []
    for cls in display_cls:
        s = per_class.get(cls, 0)
        a = per_acc.get(cls, 0)
        w = int(s / scale * 200)
        row_parts.append(
            f"<tr><td>{esc(cls)}</td><td>{s:.4f}</td><td>{a:.1%}</td>"
            f'<td><div class="bar" style="width:{w}px"></div></td></tr>\n'
        )
    class_rows = "".join(row_parts)

    vuln_parts = []
    total_v = len(vuln)
    shown_vuln = vuln[:10] if num_classes > 30 else vuln
    for rank, (cls, score) in enumerate(shown_vuln, 1):
        if rank <= 3:
            status = '<span class="fail">⚠ Vulnerable</span>'
        elif rank >= total_v - 2:
            status = '<span class="pass">✓ Robust</span>'
        else:
            status = '<span class="warn">— Average</span>'
        vuln_parts.append(
            f"<tr><td>{rank}</td><td>{esc(cls)}</td><td>{score:.4f}</td><td>{status}</td></tr>\n"
        )
    vuln_rows = "".join(vuln_parts)

    if rdi >= 0.3:
        assessment = (
            f"<strong class='fail'>High disparity (RDI={rdi:.3f}).</strong> "
            f"Class <em>{safe_wcr_class}</em> is significantly more vulnerable. "
        )
    elif rdi >= 0.1:
        assessment = f"<strong class='warn'>Moderate disparity (RDI={rdi:.3f}).</strong> Some classes are noticeably more vulnerable."
    else:
        assessment = f"<strong class='pass'>Low disparity (RDI={rdi:.3f}).</strong> Robustness is distributed relatively evenly across classes."

    if wcr < 0.05:
        assessment += f" <strong class='fail'>Critical:</strong> Worst-case class ({safe_wcr_class}) has near-zero robustness (WCR={wcr:.4f})."
    elif wcr < 0.2:
        assessment += f" Worst-case class ({safe_wcr_class}) has limited robustness (WCR={wcr:.4f})."

    # The label says "UTC", so take the timestamp in UTC rather than in the
    # server's local time zone.
    generated = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>GF-Score Audit — {safe_model}</title>
<style>
body{{font-family:'Segoe UI',sans-serif;margin:0;background:#f5f7fa;color:#333}}
.wrap{{max-width:860px;margin:24px auto;background:#fff;padding:36px;border-radius:10px;box-shadow:0 2px 12px rgba(0,0,0,.1)}}
h1{{color:#2c3e50;border-bottom:3px solid #3498db;padding-bottom:8px;font-size:1.4em}}
h2{{color:#34495e;margin-top:28px;font-size:1.1em}}
.cards{{display:flex;flex-wrap:wrap;gap:12px;margin:12px 0}}
.card{{background:#ecf0f1;padding:14px 18px;border-radius:8px;text-align:center;min-width:130px}}
.card .val{{font-size:1.7em;font-weight:700;color:#2c3e50}}
.card .lbl{{font-size:.7em;color:#7f8c8d;text-transform:uppercase;margin-top:2px}}
.pass{{color:#27ae60}}.warn{{color:#e67e22}}.fail{{color:#e74c3c}}
table{{border-collapse:collapse;width:100%;margin:12px 0;font-size:.9em}}
th{{background:#3498db;color:#fff;padding:9px 14px;text-align:left}}
td{{padding:7px 14px;border-bottom:1px solid #eee}}
tr:nth-child(even){{background:#f9f9f9}}
.bar{{height:14px;background:linear-gradient(90deg,#3498db,#2ecc71);border-radius:3px;display:inline-block}}
.footer{{margin-top:24px;padding-top:12px;border-top:1px solid #eee;font-size:.75em;color:#aaa}}
</style>
</head>
<body>
<div class="wrap">
<h1>🛡️ GF-Score Robustness Audit Report</h1>
<p><strong>Model:</strong> {safe_model}<br>
<strong>Dataset:</strong> {ds_label} | <strong>Threat Model:</strong> {threat} | <strong>Classes:</strong> {num_classes}<br>
<strong>Generated:</strong> {generated}</p>

<h2>Summary Metrics</h2>
<div class="cards">
<div class="card"><div class="val">{agg:.4f}</div><div class="lbl">GREAT Score</div></div>
<div class="card"><div class="val {rdi_css}">{rdi:.4f}</div><div class="lbl">RDI</div></div>
<div class="card"><div class="val">{nrgc:.4f}</div><div class="lbl">NRGC (Gini)</div></div>
<div class="card"><div class="val {wcr_css}">{wcr:.4f}</div><div class="lbl">WCR ({safe_wcr_class})</div></div>
<div class="card"><div class="val">{fp_great:.4f}</div><div class="lbl">FP-GREAT (λ={lambda_val:.2f})</div></div>
</div>

<h2>Per-Class Robustness Profile</h2>
<table>
<tr><th>Class</th><th>GREAT Score</th><th>Clean Acc.</th><th>Visual</th></tr>
{class_rows}
</table>

<h2>Vulnerability Ranking</h2>
<table>
<tr><th>Rank</th><th>Class</th><th>Score</th><th>Status</th></tr>
{vuln_rows}
</table>

<h2>Assessment</h2>
<p>{assessment}</p>

<div class="footer">
GF-Score v0.1.0 · Based on GREAT Score (Li et al., NeurIPS 2024) extended with per-class fairness metrics ·
Metrics: RDI (Max Group Disparity), NRGC (Gini), WCR (Rawlsian Maximin), FP-GREAT (IHDI Adaptation)
</div>
</div>
</body>
</html>"""
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# ---------------------------------------------------------------------------
|
| 359 |
+
# Gradio App
|
| 360 |
+
# ---------------------------------------------------------------------------
|
| 361 |
+
|
| 362 |
+
def build_app():
    """Assemble the Gradio dashboard and return the (not yet launched) Blocks app.

    All available results files are loaded once up front. The first available
    dataset and its first model are pre-selected in the two dropdowns.
    """
    datasets = get_available_datasets()
    initial_ds = datasets[0]

    # Per-dataset results, shared by every callback defined below.
    loaded = {}
    for ds_key in datasets:
        loaded[ds_key] = load_results(ds_key)

    dataset_labels = {
        "cifar10": "CIFAR-10 (10 classes · L2 threat model · 17 models)",
        "imagenet": "ImageNet (1000 classes · L∞ threat model · 5 models)",
    }

    initial_results = loaded.get(initial_ds)
    initial_models = get_model_choices(initial_results, initial_ds)
    initial_model = initial_models[0] if initial_models else None

    with gr.Blocks(
        title="GF-Score Auditing Dashboard",
        theme=gr.themes.Soft(),
        css=".gr-markdown table { width: 100%; }",
    ) as demo:

        # State holders for the selected dataset key and its results.
        current_ds = gr.State(initial_ds)
        current_results = gr.State(initial_results)

        gr.Markdown("""
# 🛡️ GF-Score: Fairness-Aware Robustness Auditing Dashboard

Inspect **class-conditional adversarial robustness** of certified models with four fairness metrics
grounded in welfare economics. Based on [GREAT Score (NeurIPS 2024)](https://arxiv.org/abs/2304.09875),
extended with per-class decomposition, disparity analysis, and **attack-free** self-calibration.

| Metric | Meaning |
|--------|---------|
| **RDI** | Range of per-class robustness (Max Group Disparity) |
| **NRGC** | Normalized Gini Coefficient — overall inequality |
| **WCR** | Worst-case class robustness (Rawlsian maximin) |
| **FP-GREAT** | Fairness-penalized aggregate score: Ω̄ − λ·RDI |
""")

        with gr.Row():
            dataset_dd = gr.Dropdown(
                choices=[(dataset_labels[ds_key], ds_key) for ds_key in datasets],
                value=initial_ds,
                label="Dataset",
                scale=2,
            )
            model_dd = gr.Dropdown(
                choices=initial_models,
                value=initial_model,
                label="Model",
                scale=2,
            )

        with gr.Row():
            lambda_sl = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.5, step=0.05,
                label="Fairness Penalty λ (FP-GREAT = GREAT Score − λ × RDI)",
                scale=3,
            )
            analyze_btn = gr.Button("🔍 Analyze", variant="primary", scale=1)

        with gr.Tabs():
            with gr.TabItem("📊 Analysis"):
                analysis_md = gr.Markdown()
            with gr.TabItem("📄 Full HTML Report"):
                report_html = gr.HTML()

        # ---- callbacks ----

        def on_dataset_change(ds_choice):
            """Switch datasets: refresh the model dropdown and both state values."""
            res = loaded.get(ds_choice)
            if not res:
                # Nothing usable cached for this dataset; try the file again.
                res = load_results(ds_choice)
            loaded[ds_choice] = res
            model_names = get_model_choices(res, ds_choice)
            first_model = model_names[0] if model_names else None
            return (
                gr.update(choices=model_names, value=first_model),
                ds_choice,
                res,
            )

        def run(model_name, lam, ds, res):
            """Produce the Markdown analysis and HTML report for the current selection."""
            return analyze_model(model_name, lam, ds, res)

        dataset_dd.change(
            fn=on_dataset_change,
            inputs=[dataset_dd],
            outputs=[model_dd, current_ds, current_results],
        )

        # The button click and the slider release trigger the same analysis.
        analysis_wiring = {
            "fn": run,
            "inputs": [model_dd, lambda_sl, current_ds, current_results],
            "outputs": [analysis_md, report_html],
        }
        analyze_btn.click(**analysis_wiring)
        lambda_sl.release(**analysis_wiring)

        gr.Markdown("""---
*GF-Score v0.1.0 · [Paper (NeurIPS 2026, under review)]() · [GitHub](https://github.com/aryashah00/GF-Score)*""")

    return demo
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
# Built at import time and exposed as a module-level name; the module
# docstring identifies this file as the Hugging Face Spaces entry point.
demo = build_app()

# Start the server only when this file is executed directly as a script.
if __name__ == "__main__":
    demo.launch()
|