# -*- coding: utf-8 -*-
"""
DEFER-RL Radiologist Reader Study - Gradio app (deployable as a Hugging Face Space).
Design goal: judging one case needs ZERO scrolling-to-understand and ZERO guessing.
Every rating control sits directly under the image it refers to, every scale legend is
printed inline, every rating value has a hover tooltip, and each image has its own
display-only zoom / brightness / contrast strip.
Study adaptation
----------------
Each CASE shows, for one patient study:
* LEFT cell of every row = the REFERENCE imaging (always visible for comparison).
* RIGHT cell, ONE ROW PER ITEM = an anonymized deferral-system DECISION panel
(DEFER-RL plus baselines). Order is randomized per (annotator, case); names hidden.
* FINAL ROW = the reference-standard / ground-truth panel.
The reader rates each panel's decision; the backend DERIVES best/worst/rankings.
"""
import os, io, json, base64, random, hashlib, datetime, threading, pathlib
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import gradio as gr
# ----------------------------------------------------------------------------- config
# Runtime configuration. Apart from the two literals directly below, every setting is read
# from an environment variable once, at import time, and falls back to the default shown.
N_PER_PAGE_NOTE = "" # placeholder
# NOTE(review): presumably stamped onto saved records; the writer is not in this section - confirm.
SCHEMA_VERSION = "deferrl-reader-1"
DATA_DIR = pathlib.Path(os.environ.get("DATA_DIR", "data")) # case manifest + images
RESP_DIR = pathlib.Path(os.environ.get("RESP_DIR", "responses_local")) # local response store
DATASET_REPO = os.environ.get("DATASET_REPO", "").strip() # private HF dataset id, e.g. "org/deferrl-reader-responses"
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
# float()/int() raise ValueError at import if the variable holds a non-numeric string.
COMMIT_EVERY_MIN= float(os.environ.get("COMMIT_EVERY_MIN", "1")) # sync cadence to the dataset
MAX_ITEMS = int(os.environ.get("MAX_ITEMS", "5")) # max decision panels per case (build budget)
# Import-time side effect: create both directories (and any missing parents) if absent.
RESP_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
# Per-user credentials. On the Space, set the secret ANNOTATORS to a JSON object
# {"dr_smith": "their-password", ...}. The username they type is their annotator name
# and keys their own response file. Falls back to a demo login for local testing.
def _load_annotators(raw):
    """Parse the ANNOTATORS secret into a ``{username: password}`` dict.

    Falls back to the demo login ``{"demo": "demo"}`` when the value is unset/blank,
    is not valid JSON, is valid JSON but not an object (a list, string, number, ...),
    or is an empty object. A warning is printed for the malformed cases so that a
    broken secret does not silently turn a deployment into a demo-login instance.
    The raw value is never printed because it contains passwords.

    Args:
        raw: The text of the ANNOTATORS environment variable (may be empty or None).

    Returns:
        A non-empty dict of credentials.
    """
    if not raw or not raw.strip():
        # Nothing configured: the local-testing default, no warning needed.
        return {"demo": "demo"}
    try:
        parsed = json.loads(raw)
    except (ValueError, RecursionError) as exc:
        # json.JSONDecodeError is a ValueError; its message gives a position, not the content.
        print(f"[auth] ANNOTATORS is not valid JSON ({exc}); falling back to the demo login")
        return {"demo": "demo"}
    if not isinstance(parsed, dict):
        print(f"[auth] ANNOTATORS must be a JSON object, got {type(parsed).__name__}; "
              f"falling back to the demo login")
        return {"demo": "demo"}
    return parsed or {"demo": "demo"}


ANNOTATORS = _load_annotators(os.environ.get("ANNOTATORS", ""))
# Rating dimensions (anchored 1-5 Likert + one inline yes/no judgement).
# Each scale below is a list of (stored value, description) pairs; values are strings.
# LIKERT: 1-5 appropriateness anchors for a panel's decision.
LIKERT = [
    ("1", "Clearly inappropriate: a decision a careful radiologist would not make here"),
    ("2", "Probably inappropriate"),
    ("3", "Equivocal / borderline"),
    ("4", "Probably appropriate"),
    ("5", "Clearly appropriate: the decision a careful radiologist would endorse"),
]
# LIKERT_ADEQ: 1-5 anchors for how adequate the gathered evidence was.
LIKERT_ADEQ = [
    ("1", "Clearly inadequate: key evidence was ignored or never gathered"),
    ("2", "Probably inadequate"),
    ("3", "Equivocal"),
    ("4", "Probably adequate"),
    ("5", "Clearly adequate: the evidence gathered was sufficient and on point"),
]
# LIKERT_SOUND: 1-5 anchors for the clinical soundness of a committed reading.
LIKERT_SOUND = [
    ("1", "Clearly unsound: the committed reading is wrong on a clinically important point"),
    ("2", "Probably unsound"),
    ("3", "Equivocal"),
    ("4", "Probably sound"),
    ("5", "Clearly sound: the committed reading is clinically correct"),
]
# Two- and three-way categorical answer sets.
YESNO = [("yes", "Yes"), ("no", "No")]
YPN = [("yes", "Yes"), ("partial", "Partial"), ("no", "No")]
# required dimensions per item; soundness only applies when the panel chose Trust
REQ_ITEM_DIMS_TRUST = ["appropriateness", "evidence", "soundness", "misleading"]
REQ_ITEM_DIMS_DEFER = ["appropriateness", "evidence", "misleading"]
# Case-level dimensions that must be answered before a case counts as complete.
REQ_CASE_DIMS = ["unsafe_to_autoread", "reference_adequate"]
# ----------------------------------------------------------------------------- synthetic sample data
def _font(sz):
    """Return the DejaVu Sans font at size ``sz``, or PIL's default font if it cannot be loaded."""
    try:
        face = ImageFont.truetype("DejaVuSans.ttf", sz)
    except Exception:
        # Any loading problem falls back to the built-in font.
        face = ImageFont.load_default()
    return face
def make_placeholder(text, seed, w=360, h=300):
    """Render a clearly-labelled grayscale CT/MRI-like placeholder image.

    Lets the Space run, and be obviously recognizable as sample data, before real
    cases are loaded. The picture is Gaussian noise plus one bright Gaussian blob
    with a ring drawn around it, a dark banner at the top carrying ``text`` and a
    footer stating that the image is synthetic. All randomness comes from a
    generator seeded with ``seed``.

    Args:
        text: Banner label; the banner height is computed from its number of lines.
        seed: Seed passed to ``np.random.default_rng``.
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        An RGB ``PIL.Image.Image`` of size ``(w, h)``.
    """
    rng = np.random.default_rng(seed)
    # Noise background; the draws below happen in a fixed order, which fixes the output for a seed.
    base = rng.normal(70, 16, (h, w)).clip(0, 255)
    yy, xx = np.mgrid[0:h, 0:w]
    # Blob centre is kept away from the image edges.
    cx, cy = int(rng.integers(w // 4, 3 * w // 4)), int(rng.integers(h // 3, 3 * h // 4))
    r = int(rng.integers(26, 52))
    # 2-D Gaussian bump of width r, scaled by a random peak intensity.
    blob = np.exp(-(((xx - cx) ** 2 + (yy - cy) ** 2) / (2 * r * r))) * rng.integers(80, 150)
    img = (base + blob).clip(0, 255).astype("uint8")
    im = Image.fromarray(img, "L").convert("RGB")
    d = ImageDraw.Draw(im)
    # lesion-style marker ring so the "image" reads as a scan
    d.ellipse([cx - r, cy - r, cx + r, cy + r], outline=(255, 205, 70), width=3)
    # high-contrast top label banner (multi-line)
    lines = text.split("\n")
    bh = 19 * len(lines) + 10
    d.rectangle([0, 0, w, bh], fill=(18, 20, 32))
    d.text((8, 6), text, fill=(255, 255, 255), font=_font(16))
    # unmistakable footer so testers know this is not real data
    d.rectangle([0, h - 22, w, h], fill=(60, 20, 20))
    d.text((8, h - 20), "SYNTHETIC SAMPLE - replace with real data", fill=(255, 190, 190), font=_font(12))
    # thin grey frame around the whole image, drawn last so it sits on top
    d.rectangle([0, 0, w - 1, h - 1], outline=(150, 150, 150))
    return im
def synth_cases(n=6, k=4):
    """Generate a synthetic manifest and its placeholder images under DATA_DIR.

    For every case this writes one reference image, one reference-standard image
    and, for each of the first ``k`` system names, one panel image plus three small
    trail thumbnails. The manifest is then written to ``DATA_DIR / "cases.json"``.
    Real runs replace data/cases.json + data/images/.

    Args:
        n: Number of cases to generate.
        k: Number of systems (panels) per case, taken from the start of the
            built-in list of five names.

    Returns:
        The list of case dicts that was written to the manifest.
    """
    image_dir = DATA_DIR / "images"
    image_dir.mkdir(parents=True, exist_ok=True)
    system_names = ["defer_rl", "atcxr", "conformal_l2d", "always_defer", "chow_rule"][:k]
    cohort_names = ["LIDC-IDRI chest CT", "NLST LongCT", "Duke Breast MRI"]

    manifest = []
    for case_no in range(n):
        cid = f"C{case_no+1:03d}"
        cohort = cohort_names[case_no % len(cohort_names)]
        odd_case = bool(case_no % 2)

        ref_img = make_placeholder(f"{cid} REFERENCE\n{cohort}", seed=case_no * 100 + 1)
        ref_img.save(image_dir / f"{cid}_ref.png")
        gt_img = make_placeholder(f"{cid} REFERENCE STANDARD", seed=case_no * 100 + 2)
        gt_img.save(image_dir / f"{cid}_gt.png")

        panels = []
        for sys_no, name in enumerate(system_names):
            # Every third (case, system) pairing defers, and "always_defer" always does.
            if (case_no + sys_no) % 3 == 0 or name == "always_defer":
                action = "Defer"
                reading = "(routed to radiologist)"
            else:
                action = "Trust"
                if (case_no + sys_no) % 2:
                    reading = "No suspicious finding. BI-RADS 1."
                else:
                    reading = "Indeterminate nodule, recommend follow-up."

            panel_file = f"{cid}_{name}.png"
            panel_img = make_placeholder(f"{cid} panel\n{action}", seed=case_no * 100 + 10 + sys_no)
            panel_img.save(image_dir / panel_file)

            trail_paths = []
            for step in range(3):
                thumb_file = f"{cid}_{name}_t{step}.png"
                thumb = make_placeholder(
                    f"step {step+1}", seed=case_no * 1000 + sys_no * 10 + step, w=120, h=100)
                thumb.save(image_dir / thumb_file)
                trail_paths.append(f"images/{thumb_file}")

            panels.append({
                "item_id": name,
                "action": action,
                "reading": reading,
                "image": f"images/{panel_file}",
                "trail": trail_paths,
            })

        if odd_case:
            gt_text = (f"Reference standard for {cid}: 8 mm spiculated nodule, right upper lobe; "
                       f"path-confirmed malignant.")
        else:
            gt_text = f"Reference standard for {cid}: no malignant finding; benign granuloma."

        manifest.append({
            "case_id": cid,
            "cohort": cohort,
            "reference_image": f"images/{cid}_ref.png",
            "show_trail": not odd_case,  # evidence-trail ablation condition
            "ground_truth": {"image": f"images/{cid}_gt.png", "text": gt_text},
            "items": panels,
        })

    (DATA_DIR / "cases.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    return manifest
def load_cases():
    """Return the case manifest from ``DATA_DIR/cases.json``, generating sample data if it is absent."""
    manifest = DATA_DIR / "cases.json"
    if not manifest.exists():
        # No manifest yet: build the synthetic one (this also writes it to disk).
        return synth_cases()
    return json.loads(manifest.read_text(encoding="utf-8"))
# The manifest is loaded once, at import time; on a fresh data directory this call
# generates the synthetic sample cases and writes them to disk.
CASES = load_cases()
N_CASES = len(CASES)
# Lookup table: case_id -> case dict.
CASE_BY_ID = {c["case_id"]: c for c in CASES}
import functools
# Read from the environment at import; int() raises ValueError on a non-numeric value.
DISPLAY_MAX = int(os.environ.get("DISPLAY_MAX", "800")) # cap longest display side; zoom still available
@functools.lru_cache(maxsize=2048)
def b64img(rel_path):
    """Return the image at ``DATA_DIR / rel_path`` as a base64-encoded PNG string.

    The image is converted to RGB and, if its longest side exceeds ``DISPLAY_MAX``,
    shrunk in place to fit within ``DISPLAY_MAX`` x ``DISPLAY_MAX``. Results are
    cached per ``rel_path``, so a reference reused across rows is encoded once.

    Args:
        rel_path: Image path relative to ``DATA_DIR``.

    Returns:
        ASCII base64 text of the PNG bytes. If the file cannot be opened or decoded,
        a "missing image" placeholder is encoded instead (and that result is cached too).
    """
    p = DATA_DIR / rel_path
    try:
        # Image.open is lazy and keeps the file handle; the context manager releases it
        # deterministically. convert() returns an independent in-memory copy.
        with Image.open(p) as src:
            im = src.convert("RGB")
    except Exception:
        # Best-effort display: a broken or absent file must not take the page down.
        im = make_placeholder("missing image", 0)
    if max(im.size) > DISPLAY_MAX:
        im.thumbnail((DISPLAY_MAX, DISPLAY_MAX))
    buf = io.BytesIO()
    im.save(buf, "PNG")
    return base64.b64encode(buf.getvalue()).decode("ascii")
# ----------------------------------------------------------------------------- storage (robust schema)
# One JSONL line per (annotator, case_id, item_id, dimension) -> value. Atomic, self-describing,
# so any later UI/layout/wording change can never overwrite or invalidate prior annotations.
_LOCK = threading.Lock()


def _resp_path(annotator):
    """Map an annotator name to its response file, ``RESP_DIR/<name>.jsonl``.

    Only alphanumeric characters and ``.``, ``_``, ``-`` are kept from the name;
    if nothing is left the stem ``anon`` is used.
    """
    kept = [ch for ch in annotator if ch.isalnum() or ch in "._-"]
    stem = "".join(kept) or "anon"
    return RESP_DIR / f"{stem}.jsonl"
def append_records(annotator, records):
    """Append ``records`` to the annotator's JSONL response file, one JSON object per line.

    The whole batch is serialised before the file is opened, so a record that cannot
    be serialised raises without leaving part of the batch on disk. The batch is then
    written with a single ``write`` while holding the module lock.

    Args:
        annotator: Annotator name; selects the response file.
        records: Iterable of JSON-serialisable records.

    Raises:
        TypeError, ValueError: If a record cannot be serialised by ``json.dumps``.
        OSError: If the response file cannot be opened or written.
    """
    payload = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
    with _LOCK:
        with open(_resp_path(annotator), "a", encoding="utf-8") as fh:
            fh.write(payload)
def read_records(annotator):
    """Read every stored record for ``annotator``, in file order.

    Blank lines, lines that are not valid JSON (for example a torn final line) and
    lines whose JSON value is not an object are skipped, so callers can rely on every
    returned element being a dict.

    Args:
        annotator: Annotator name; selects the response file.

    Returns:
        A list of dicts, oldest first; empty if the annotator has no response file.
    """
    try:
        fh = open(_resp_path(annotator), encoding="utf-8")
    except FileNotFoundError:
        # No responses yet. Opening directly avoids an exists()/open() race.
        return []
    out = []
    with fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except (ValueError, RecursionError):
                # Corrupt or partially written line: skip it, keep the rest of the log.
                continue
            if isinstance(rec, dict):
                out.append(rec)
    return out
def latest_values(annotator, case_id):
    """Collapse the append-only log into ``{item_id: {dimension: value}}`` for one case.

    Records are applied in file order, so for each (item_id, dimension) the value
    of the last matching record wins.
    """
    collapsed = {}
    for rec in read_records(annotator):
        if rec.get("case_id") == case_id:
            # A later record for the same (item, dimension) overwrites the earlier one.
            collapsed.setdefault(rec.get("item_id"), {})[rec.get("dimension")] = rec.get("value")
    return collapsed
def case_complete(annotator, case, values=None):
    """Return True when every required rating for ``case`` has a non-empty value.

    Args:
        annotator: Annotator name; used to load saved values when ``values`` is None.
        case: Case dict with ``case_id`` and ``items`` (each item has ``item_id`` and ``action``).
        values: Optional pre-collapsed ``{item_id: {dimension: value}}`` mapping.

    Returns:
        False if any case-level dimension, or any per-item dimension required for that
        item's action, is missing, ``None`` or ``""``; True otherwise.
    """
    answers = latest_values(annotator, case["case_id"]) if values is None else values

    def unanswered(got, dims):
        # A dimension counts as unanswered when it is absent, None or the empty string.
        return any(got.get(dim) in (None, "") for dim in dims)

    if unanswered(answers.get("__case__", {}), REQ_CASE_DIMS):
        return False
    for item in case["items"]:
        # Soundness is only required when the panel committed a reading (Trust).
        needed = REQ_ITEM_DIMS_TRUST if item["action"] == "Trust" else REQ_ITEM_DIMS_DEFER
        if unanswered(answers.get(item["item_id"], {}), needed):
            return False
    return True
def progress_counts(annotator):
    """Return how many cases in CASES the annotator has fully completed."""
    finished = [case for case in CASES if case_complete(annotator, case)]
    return len(finished)
def first_unfinished(annotator):
    """Return the index of the first case the annotator has not completed, or 0 if all are done."""
    pending = (pos for pos, case in enumerate(CASES) if not case_complete(annotator, case))
    return next(pending, 0)  # all done -> show first
# Sync local responses to a private HF dataset (best-effort; app still runs locally without it).
# SCHEDULER stays None unless both DATASET_REPO and HF_TOKEN are set and the scheduler
# can be constructed. All of this runs at import time.
SCHEDULER = None
if DATASET_REPO and HF_TOKEN:
    try:
        # Imported here so the app still starts when huggingface_hub is not installed.
        from huggingface_hub import CommitScheduler
        # Uploads the contents of RESP_DIR to "responses/" in the dataset repo on a timer.
        SCHEDULER = CommitScheduler(
            repo_id=DATASET_REPO, repo_type="dataset", folder_path=str(RESP_DIR),
            path_in_repo="responses", every=COMMIT_EVERY_MIN, token=HF_TOKEN, private=True,
            squash_history=False,
        )
        print(f"[storage] CommitScheduler -> {DATASET_REPO} every {COMMIT_EVERY_MIN} min")
    except Exception as e:
        # Best-effort: any failure (missing package, bad token, ...) leaves responses local.
        print(f"[storage] CommitScheduler disabled ({e}); responses stay local under {RESP_DIR}")
else:
    print(f"[storage] No DATASET_REPO/HF_TOKEN set; responses stay local under {RESP_DIR}")
# ----------------------------------------------------------------------------- presentation order (blinding)
def presented_items(annotator, case):
    """Return the case's items in a shuffled order that is fixed per (annotator, case).

    The shuffle seed is derived from a SHA-256 hash of ``"<annotator>|<case_id>"``, so
    the same pair always yields the same order and resume/prefill map correctly.
    The input list is copied, never modified.
    """
    key = f"{annotator}|{case['case_id']}".encode()
    # Big-endian integer value of the digest, reduced to 32 bits.
    seed = int.from_bytes(hashlib.sha256(key).digest(), "big") % (2**32)
    order = list(case["items"])
    random.Random(seed).shuffle(order)
    return order
# ----------------------------------------------------------------------------- HTML rendering
def _esc(s):
    """Return ``str(s)`` with HTML-special characters replaced by entities.

    Escapes ``&``, ``<``, ``>`` and both quote characters, so the result is safe
    in element text and inside quoted attribute values.
    """
    import html  # local import: keeps this block self-contained
    return html.escape(str(s), quote=True)
def img_cell(uid, b64, caption, h=210):
    # Returns the markup string for one image cell. The only interpolation visible in
    # the template is the escaped caption.
    # NOTE(review): uid, b64 and h are not referenced by the template as it appears
    # here - the surrounding HTML looks like it was stripped from this copy of the
    # file. Confirm against the deployed source before relying on this function.
    return f"""
{_esc(caption)}
☀◐
"""
def trail_html(uid_base, trail):
    # Returns "" for an empty/missing trail; otherwise joins one fragment per trail
    # entry and places the result after the "Evidence trail the system examined:" label.
    # NOTE(review): the per-entry fragment is an empty f-string, uid_base is unused, and
    # the final f-string spans several lines inside single quotes (not valid Python as
    # shown) - the HTML looks like it was stripped from this copy. Confirm against the
    # deployed source.
    if not trail:
        return ""
    thumbs = "".join(
        f''
        for i, t in enumerate(trail))
    return f'
Evidence trail the system examined:{thumbs}
'
def radio_group(item_id, dim, label, legend, options, prefill, qlevel="item"):
pre = (prefill or {}).get(item_id, {}).get(dim) if qlevel == "item" else (prefill or {}).get("__case__", {}).get(dim)
opts = ""
for val, tip in options:
chk = "checked" if str(pre) == str(val) else ""
opts += (f'')
return (f'
What you are looking at. The lettered panels below are decisions produced by several
automated systems on this same {_esc(case.get('cohort','imaging'))} study. Their order is shuffled and
their identities are hidden. At least one panel is a simple reference baseline (for example, a system
that always defers, or one that uses only a confidence threshold). Each panel states the system's
action — Trust (it commits an automated reading) or Defer (it routes the case
to a radiologist) — the evidence it examined, and, when it chose Trust, the reading it produced.
You are rating each panel's decision on its own; you are never asked to pick a best or worst panel.
You can stop and resume at any time — your answers are saved when you press
Save & Next, and you will return to the first case you have not finished.
"""
# Case-level judgements (answer your own read first, before scoring the panels)
caselevel = f"""
Your own read first (before scoring the panels)
{radio_group("__case__","unsafe_to_autoread",
"Should this case not be auto-read? ",
"(Is this a study where an automated reading should not be trusted, and a radiologist must see it?)",
YPN, prefill, qlevel="case")}
{radio_group("__case__","reference_adequate",
"Is the reference imaging adequate to judge this case? ",
"(yes / partial / no)", YPN, prefill, qlevel="case")}
"""
rows = ""
for p, it in enumerate(pres):
letter = LETTERS[p]
ref_uid = f"ref-{idx}-{p}"
it_uid = f"it-{idx}-{p}"
action = it["action"]
if action == "Trust":
actline = (f'
'
# soundness only when the panel committed a reading
if action == "Trust":
sound = radio_group(it["item_id"], "soundness",
"Was the committed reading clinically sound? ",
"(1 clearly unsound · 3 equivocal · 5 clearly sound)",
LIKERT_SOUND, prefill)
else:
sound = ('
Reading soundness
'
'
Not applicable — this panel deferred and committed no reading.
')
mislead = (f'
Was this decision '
f'misleading?
'
f'
misleading = the decision looks confident and plausible but is '
f'clinically wrong. Example: the system Trusts a “no nodule” reading on a '
f'study that in fact shows an 8 mm spiculated nodule; or it Defers an obviously '
f'benign study as if it were dangerous.