from __future__ import annotations import json import uuid from html import escape from typing import Any, Dict, List, Sequence, Tuple def _to_data_url(raw_b64: str) -> str: if not raw_b64: return "" if raw_b64.startswith("data:"): return raw_b64 return f"data:image/png;base64,{raw_b64}" def _strip_prefix(label: str) -> str: if ":" in label: return label.split(":", 1)[1] return label def _value_to_color(value: float, max_abs: float) -> str: if max_abs <= 0: return "rgb(225, 225, 223)" norm = min(1.0, abs(value) / max_abs) if value >= 0: base = (225, 225, 223) target = (1, 109, 1) else: base = (225, 225, 223) target = (221, 19, 19) r = int(round(base[0] + (target[0] - base[0]) * norm)) g = int(round(base[1] + (target[1] - base[1]) * norm)) b = int(round(base[2] + (target[2] - base[2]) * norm)) return f"rgb({r}, {g}, {b})" def _format_region_boxes( regions: Sequence[Dict[str, Any]], image_size: Tuple[int, int] | None, ) -> List[str]: if not image_size: return [] width, height = image_size boxes = [] for region in regions: bbox = region.get("bbox") if not bbox or len(bbox) != 4: continue x0, y0, x1, y1 = [float(v) for v in bbox] if width <= 0 or height <= 0: continue left = max(0.0, min(100.0, (x0 / width) * 100)) top = max(0.0, min(100.0, (y0 / height) * 100)) w_pct = max(0.0, min(100.0, ((x1 - x0) / width) * 100)) h_pct = max(0.0, min(100.0, ((y1 - y0) / height) * 100)) idx = region.get("index", 0) label = escape(region.get("label") or f"Region {int(idx) + 1}") boxes.append( "
" "
" ) return boxes def create_multimodal_interaction_html( image_b64: str, overlay_b64: str | None, regions: Sequence[Dict[str, Any]], features: Sequence[Dict[str, Any]], interactions: Sequence[Dict[str, Any]], *, image_size: Tuple[int, int] | None = None, top_k: int = 20, title: str = "Multimodal Interaction View", ) -> str: if not image_b64: return "
No image available.
" view_id = f"mm-interaction-{uuid.uuid4().hex[:8]}" image_url = _to_data_url(image_b64) overlay_url = _to_data_url(overlay_b64 or "") max_abs = max((abs(float(item.get("value", 0.0))) for item in features), default=0.0) region_labels = { int(region.get("index", 0)): str(region.get("label") or f"Region {int(region.get('index', 0)) + 1}") for region in regions } tokens_html = [] feature_meta: List[Dict[str, Any]] = [] for item in features: idx = int(item.get("index", 0)) value = float(item.get("value", 0.0)) modality = item.get("modality") or "text" ref_index = int(item.get("ref_index", idx)) raw_label = str(item.get("feature", "")) label = _strip_prefix(raw_label) if modality == "image": label = region_labels.get(ref_index, label or f"Region {ref_index + 1}") display = escape(label) color = _value_to_color(value, max_abs) tooltip = escape(f"{label}: {value:+.4f}") tokens_html.append( "" f"{display}" "" ) feature_meta.append( { "index": idx, "modality": modality, "ref_index": ref_index, "label": label, "value": value, } ) edges = [] for item in interactions: indices = item.get("indices") if not indices or len(indices) != 2: continue try: i = int(indices[0]) j = int(indices[1]) value = float(item.get("value", 0.0)) except Exception: continue if i == j: continue edges.append({"i": i, "j": j, "value": value}) edges.sort(key=lambda entry: abs(entry["value"]), reverse=True) edges = edges[: max(0, int(top_k))] payload = json.dumps( { "edges": edges, "features": feature_meta, } ) region_boxes = _format_region_boxes(regions, image_size) no_edges_note = "" if edges else "
No interactions to display.
" overlay_html = f"overlay" if overlay_url else "" script_id = f"{view_id}-script" loader_id = f"{view_id}-loader" js_code = ( f"const root=document.getElementById('{view_id}');" "if(!root){return;}" f"const data={payload};" "const edges=data.edges||[];" "const features=data.features||[];" "const tokens=[...root.querySelectorAll('.mm-token')];" "const regions=[...root.querySelectorAll('.mm-region')];" "const linkPanel=root.querySelector('.mm-link-list');" "const tokenMap=new Map(tokens.map(el=>[el.dataset.featureIndex,el]));" "const regionMap=new Map(regions.map(el=>[el.dataset.regionIndex,el]));" "const featureMap=new Map(features.map(f=>[String(f.index),f]));" "const regionToFeature=new Map(" "features.filter(f=>f.modality==='image').map(f=>[String(f.ref_index),String(f.index)])" ");" "const adjacency=new Map();" "edges.forEach(edge=>{" "const a=String(edge.i);const b=String(edge.j);" "if(!adjacency.has(a)){adjacency.set(a,[]);}if(!adjacency.has(b)){adjacency.set(b,[]);}" "adjacency.get(a).push(edge);adjacency.get(b).push(edge);" "});" "function clearActive(){" "tokens.forEach(el=>el.classList.remove('is-active','is-linked'));" "regions.forEach(el=>el.classList.remove('is-active','is-linked'));" "if(linkPanel){linkPanel.innerHTML='';}" "}" "function markRegion(regionIdx, cls){" "const el=regionMap.get(String(regionIdx));" "if(el){el.classList.add(cls);}" "}" "function markToken(featureIdx, cls){" "const el=tokenMap.get(String(featureIdx));" "if(el){el.classList.add(cls);}" "}" "function updateLinks(links, focusIdx){" "if(!linkPanel){return;}" "if(!links.length){linkPanel.innerHTML='
No linked features.
';return;}" "const rows=links.map(link=>{" "const other=String(link.i)===focusIdx?String(link.j):String(link.i);" "const meta=featureMap.get(other)||{};" "const label=(meta.label||other);" "const value=Number(link.value||0).toFixed(3);" "return ``;" "});" "linkPanel.innerHTML=rows.join('');" "}" "function highlightFeature(featureIdx){" "const focusIdx=String(featureIdx);" "clearActive();" "markToken(focusIdx,'is-active');" "const meta=featureMap.get(focusIdx);" "if(meta&&meta.modality==='image'){markRegion(meta.ref_index,'is-active');}" "const links=(adjacency.get(focusIdx)||[]).slice();" "links.sort((a,b)=>Math.abs(b.value)-Math.abs(a.value));" "const topLinks=links.slice(0,6);" "topLinks.forEach(link=>{" "const other=String(link.i)===focusIdx?String(link.j):String(link.i);" "markToken(other,'is-linked');" "const otherMeta=featureMap.get(other);" "if(otherMeta&&otherMeta.modality==='image'){markRegion(otherMeta.ref_index,'is-linked');}" "});" "updateLinks(topLinks,focusIdx);" "}" "tokens.forEach(el=>{" "el.addEventListener('click',()=>highlightFeature(el.dataset.featureIndex));" "});" "regions.forEach(el=>{" "el.addEventListener('click',()=>{" "const featureIdx=regionToFeature.get(el.dataset.regionIndex);" "if(featureIdx){highlightFeature(featureIdx);}" "});" "});" "const resetBtn=root.querySelector('.mm-reset');" "if(resetBtn){resetBtn.addEventListener('click',clearActive);}" ) return ( "" f"
" "
" f"
{escape(title)}
" "
" f"input image" f"{overlay_html}" f"{''.join(region_boxes)}" "
" "
" "
" "
" "
Features
" "" "
" f"
{''.join(tokens_html)}
" f"{no_edges_note}" "" "
" f"" f"" "
" )