from __future__ import annotations
import json
import uuid
from html import escape
from typing import Any, Dict, List, Sequence, Tuple
def _to_data_url(raw_b64: str) -> str:
if not raw_b64:
return ""
if raw_b64.startswith("data:"):
return raw_b64
return f"data:image/png;base64,{raw_b64}"
def _strip_prefix(label: str) -> str:
if ":" in label:
return label.split(":", 1)[1]
return label
def _value_to_color(value: float, max_abs: float) -> str:
if max_abs <= 0:
return "rgb(225, 225, 223)"
norm = min(1.0, abs(value) / max_abs)
if value >= 0:
base = (225, 225, 223)
target = (1, 109, 1)
else:
base = (225, 225, 223)
target = (221, 19, 19)
r = int(round(base[0] + (target[0] - base[0]) * norm))
g = int(round(base[1] + (target[1] - base[1]) * norm))
b = int(round(base[2] + (target[2] - base[2]) * norm))
return f"rgb({r}, {g}, {b})"
# Build one string per region that carries a usable bounding box.
#
# regions:    dicts read via .get("bbox"), .get("index") and .get("label").
# image_size: (width, height) pair; when falsy an empty list is returned.
# Returns:    a list with one entry per region whose "bbox" has exactly four items.
#
# NOTE(review): the string literal passed to boxes.append() below appears to have
# lost its markup in this copy of the file (only bare quote characters remain).
# Restore it from version control before relying on or editing this function.
def _format_region_boxes(
regions: Sequence[Dict[str, Any]],
image_size: Tuple[int, int] | None,
) -> List[str]:
if not image_size:
return []
width, height = image_size
boxes = []
for region in regions:
bbox = region.get("bbox")
# Regions with a missing bbox, or one that does not have four entries, are skipped.
if not bbox or len(bbox) != 4:
continue
x0, y0, x1, y1 = [float(v) for v in bbox]
# Guards the divisions below; the test does not depend on the region, so a
# non-positive width or height results in every region being skipped.
if width <= 0 or height <= 0:
continue
# Position and size expressed as percentages of the image size, each clamped to [0, 100].
left = max(0.0, min(100.0, (x0 / width) * 100))
top = max(0.0, min(100.0, (y0 / height) * 100))
w_pct = max(0.0, min(100.0, ((x1 - x0) / width) * 100))
h_pct = max(0.0, min(100.0, ((y1 - y0) / height) * 100))
idx = region.get("index", 0)
# A missing/empty label falls back to "Region <index + 1>"; the result is HTML-escaped.
label = escape(region.get("label") or f"Region {int(idx) + 1}")
boxes.append(
"
"
"
"
)
return boxes
# Build a string (HTML plus a JavaScript snippet) for a view that links image
# regions and feature tokens through their pairwise interactions.
#
# Parameters (as used in the visible code):
#   image_b64    - image payload, converted with _to_data_url(). When falsy the
#                  function returns early with a "No image available." message.
#   overlay_b64  - optional overlay payload; None is treated as "".
#   regions      - dicts read via .get("index") / .get("label") here and passed on
#                  to _format_region_boxes().
#   features     - dicts read via .get("index"), .get("value"), .get("modality"),
#                  .get("ref_index") and .get("feature").
#   interactions - dicts read via .get("indices") (a pair) and .get("value").
#   image_size   - forwarded unchanged to _format_region_boxes().
#   top_k        - number of interactions kept after sorting by absolute value.
#   title        - HTML-escaped before being placed in the returned string.
#
# NOTE(review): several string literals in this function (the early-return
# message, the token / overlay / no-edges fragments, two JS template strings and
# the final return expression) look like their HTML markup was stripped from this
# copy of the file. Compare against version control before editing them.
def create_multimodal_interaction_html(
image_b64: str,
overlay_b64: str | None,
regions: Sequence[Dict[str, Any]],
features: Sequence[Dict[str, Any]],
interactions: Sequence[Dict[str, Any]],
*,
image_size: Tuple[int, int] | None = None,
top_k: int = 20,
title: str = "Multimodal Interaction View",
) -> str:
if not image_b64:
return "No image available.
"
# Random 8-hex-digit suffix; the JS below looks the root element up by this id.
view_id = f"mm-interaction-{uuid.uuid4().hex[:8]}"
image_url = _to_data_url(image_b64)
overlay_url = _to_data_url(overlay_b64 or "")
# Largest absolute feature value (0.0 when there are no features); used as the color scale.
max_abs = max((abs(float(item.get("value", 0.0))) for item in features), default=0.0)
# Region index -> display label, falling back to "Region <index + 1>".
region_labels = {
int(region.get("index", 0)): str(region.get("label") or f"Region {int(region.get('index', 0)) + 1}")
for region in regions
}
tokens_html = []
feature_meta: List[Dict[str, Any]] = []
for item in features:
idx = int(item.get("index", 0))
value = float(item.get("value", 0.0))
# Modality defaults to "text"; ref_index defaults to the feature's own index.
modality = item.get("modality") or "text"
ref_index = int(item.get("ref_index", idx))
raw_label = str(item.get("feature", ""))
label = _strip_prefix(raw_label)
# Image features take the label of the region they reference when one is known.
if modality == "image":
label = region_labels.get(ref_index, label or f"Region {ref_index + 1}")
display = escape(label)
color = _value_to_color(value, max_abs)
tooltip = escape(f"{label}: {value:+.4f}")
# NOTE(review): color and tooltip are not referenced by the literals that remain
# below - presumably they were used by the stripped markup; confirm.
tokens_html.append(
""
f"{display}"
""
)
# Metadata serialized for the JS side; "label" is stored unescaped here.
feature_meta.append(
{
"index": idx,
"modality": modality,
"ref_index": ref_index,
"label": label,
"value": value,
}
)
edges = []
for item in interactions:
indices = item.get("indices")
# Only pairwise interactions are kept.
if not indices or len(indices) != 2:
continue
# Entries whose indices or value cannot be converted are silently dropped.
try:
i = int(indices[0])
j = int(indices[1])
value = float(item.get("value", 0.0))
except Exception:
continue
# Self-interactions are ignored.
if i == j:
continue
edges.append({"i": i, "j": j, "value": value})
# Strongest interactions first, then truncate; a negative top_k keeps none.
edges.sort(key=lambda entry: abs(entry["value"]), reverse=True)
edges = edges[: max(0, int(top_k))]
payload = json.dumps(
{
"edges": edges,
"features": feature_meta,
}
)
region_boxes = _format_region_boxes(regions, image_size)
no_edges_note = "" if edges else "No interactions to display.
"
overlay_html = f"
" if overlay_url else ""
# NOTE(review): script_id and loader_id are not referenced by any literal that
# remains visible - presumably used by the stripped markup in the return value.
script_id = f"{view_id}-script"
loader_id = f"{view_id}-loader"
# JavaScript source assembled from adjacent string literals.
# NOTE(review): the snippet starts with a bare "return", which is only legal inside
# a function body - presumably the stripped markup wraps js_code in one; confirm.
js_code = (
f"const root=document.getElementById('{view_id}');"
"if(!root){return;}"
# payload is json.dumps() output interpolated directly into the script source.
# NOTE(review): if this ends up inside an inline <script> element, a label
# containing "</script>" would not be neutralized by json.dumps - confirm.
f"const data={payload};"
"const edges=data.edges||[];"
"const features=data.features||[];"
"const tokens=[...root.querySelectorAll('.mm-token')];"
"const regions=[...root.querySelectorAll('.mm-region')];"
"const linkPanel=root.querySelector('.mm-link-list');"
"const tokenMap=new Map(tokens.map(el=>[el.dataset.featureIndex,el]));"
"const regionMap=new Map(regions.map(el=>[el.dataset.regionIndex,el]));"
"const featureMap=new Map(features.map(f=>[String(f.index),f]));"
# Region index -> feature index, for image-modality features only.
"const regionToFeature=new Map("
"features.filter(f=>f.modality==='image').map(f=>[String(f.ref_index),String(f.index)])"
");"
# Adjacency list: every edge is registered under both of its endpoints.
"const adjacency=new Map();"
"edges.forEach(edge=>{"
"const a=String(edge.i);const b=String(edge.j);"
"if(!adjacency.has(a)){adjacency.set(a,[]);}if(!adjacency.has(b)){adjacency.set(b,[]);}"
"adjacency.get(a).push(edge);adjacency.get(b).push(edge);"
"});"
# clearActive removes the highlight classes and empties the link panel.
"function clearActive(){"
"tokens.forEach(el=>el.classList.remove('is-active','is-linked'));"
"regions.forEach(el=>el.classList.remove('is-active','is-linked'));"
"if(linkPanel){linkPanel.innerHTML='';}"
"}"
"function markRegion(regionIdx, cls){"
"const el=regionMap.get(String(regionIdx));"
"if(el){el.classList.add(cls);}"
"}"
"function markToken(featureIdx, cls){"
"const el=tokenMap.get(String(featureIdx));"
"if(el){el.classList.add(cls);}"
"}"
# updateLinks renders the linked features of the focused feature into the link panel.
# NOTE(review): meta.label comes from feature_meta["label"], which is stored
# unescaped, and the rows are assigned to linkPanel.innerHTML - confirm labels are
# trusted or escape them before insertion.
"function updateLinks(links, focusIdx){"
"if(!linkPanel){return;}"
"if(!links.length){linkPanel.innerHTML='No linked features.
';return;}"
"const rows=links.map(link=>{"
"const other=String(link.i)===focusIdx?String(link.j):String(link.i);"
"const meta=featureMap.get(other)||{};"
"const label=(meta.label||other);"
"const value=Number(link.value||0).toFixed(3);"
"return `${label}${value}
`;"
"});"
"linkPanel.innerHTML=rows.join('');"
"}"
# highlightFeature marks the focused feature 'is-active' and up to six of its
# strongest links (by absolute value) 'is-linked', including referenced regions.
"function highlightFeature(featureIdx){"
"const focusIdx=String(featureIdx);"
"clearActive();"
"markToken(focusIdx,'is-active');"
"const meta=featureMap.get(focusIdx);"
"if(meta&&meta.modality==='image'){markRegion(meta.ref_index,'is-active');}"
"const links=(adjacency.get(focusIdx)||[]).slice();"
"links.sort((a,b)=>Math.abs(b.value)-Math.abs(a.value));"
"const topLinks=links.slice(0,6);"
"topLinks.forEach(link=>{"
"const other=String(link.i)===focusIdx?String(link.j):String(link.i);"
"markToken(other,'is-linked');"
"const otherMeta=featureMap.get(other);"
"if(otherMeta&&otherMeta.modality==='image'){markRegion(otherMeta.ref_index,'is-linked');}"
"});"
"updateLinks(topLinks,focusIdx);"
"}"
# Click wiring: tokens highlight their own feature, regions highlight the image
# feature that references them, and the '.mm-reset' element clears all highlights.
"tokens.forEach(el=>{"
"el.addEventListener('click',()=>highlightFeature(el.dataset.featureIndex));"
"});"
"regions.forEach(el=>{"
"el.addEventListener('click',()=>{"
"const featureIdx=regionToFeature.get(el.dataset.regionIndex);"
"if(featureIdx){highlightFeature(featureIdx);}"
"});"
"});"
"const resetBtn=root.querySelector('.mm-reset');"
"if(resetBtn){resetBtn.addEventListener('click',clearActive);}"
)
# The returned string interpolates the escaped title, overlay_html, the joined
# region boxes, the joined token fragments and no_edges_note.
# NOTE(review): the surrounding markup is missing from this copy of the file.
return (
""
f""
"
"
f"
{escape(title)}
"
"
"
f"

"
f"{overlay_html}"
f"{''.join(region_boxes)}"
"
"
"
"
"
"
"
"
f"
{''.join(tokens_html)}
"
f"{no_edges_note}"
"
"
"
"
f"
![]()
"
f""
"
"
)