ropedia-xperience-10m-task-baselines / scripts /render_task_suite_infographic.py
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
45c1706 verified
#!/usr/bin/env python3
"""
Render a polished Ropedia Xperience-10M 12-task infographic.
The task names, inputs, and metrics are read from
results/episode_task_suite/summary_report.json. The output is a deterministic
PNG rendered from HTML/CSS so the labels stay legible and inspectable.
"""
from __future__ import annotations
import argparse
import base64
import html
import io
import json
import os
import subprocess
import tempfile
from pathlib import Path
# Two levels up from this file, i.e. the parent of the directory that holds the script.
ROOT = Path(__file__).resolve().parents[1]
# JSON report that supplies task metrics and episode statistics (read by load_summary()).
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
# Default value of --base-image; used as a dimmed page background when the file exists.
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
# Candidate locations of the public sample episode, tried by resolve_sample_dir().
DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
DROPBOX_SAMPLE_DIR = Path.home() / "Library/CloudStorage/Dropbox/Ropedia/data/sample/xperience-10m-sample"
# Default value of --output.
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
# Page size in CSS pixels; also passed to Playwright as the viewport size.
CANVAS_WIDTH = 1800
CANVAS_HEIGHT = 6600
# Pixel size of every modality thumbnail image generated below.
THUMB_WIDTH = 880
THUMB_HEIGHT = 520
# Task families, rendered in this order by build_html().
#   name  -> family heading
#   tone  -> small label shown next to the heading
#   color -> CSS --accent variable for the family and its cards
#   soft  -> CSS --soft variable (card gradient tint)
#   tasks -> (task name, kind badge) pairs; each task name must be a key of
#            summary["tasks"] because build_html() indexes it directly.
GROUPS = [
    {
        "name": "Label + State",
        "tone": "teal",
        "color": "#9bdfff",
        "soft": "#071d20",
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "tone": "blue",
        "color": "#ccffa0",
        "soft": "#10210a",
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "tone": "amber",
        "color": "#7ae5c3",
        "soft": "#092019",
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "tone": "red",
        "color": "#d8f4a5",
        "soft": "#1b210d",
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]
# Modality cards as (name, modality type, "sample contains" text, "baseline use" text).
# The name doubles as the lookup key into the dict returned by load_sample_thumbnails().
MODALITIES = [
    ("video", "visual stream", "6 synchronized camera MP4 streams", "RGB/fisheye/stereo frame statistics"),
    ("audio", "acoustic stream", "audio stream embedded in MP4", "audio feature group"),
    ("depth", "geometry map", "depth map + confidence channel", "spatial geometry feature block"),
    ("pose / SLAM", "camera pose", "trajectory + sparse SLAM map", "position + orientation features"),
    ("motion capture", "human motion", "body + hand joint tracks", "3D mocap feature statistics"),
    ("inertial", "wearable sensor", "accelerometer + gyroscope", "wearable motion statistics"),
    ("language", "semantic annotation", "object tags + action captions", "task labels + semantic targets"),
]
# Pairs of hand-joint indices that mocap_thumb() connects with line segments:
# five chains of four segments, all starting at joint 0.
# NOTE(review): presumably wrist -> fingertip per finger in a 21-joint layout; confirm against the dataset.
HAND_EDGES = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
]
def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
    """Encode *image* as a base64 ``data:image/...`` URI.

    Args:
        image: Any object exposing ``save(fp, format=..., **kwargs)``; in this
            script that is a Pillow image.
        fmt: Encoder name, case-insensitive. ``"JPG"`` is accepted as an alias
            for ``"JPEG"``.
        quality: JPEG quality; only passed to ``save`` for JPEG output.

    Returns:
        A ``data:image/<subtype>;base64,<payload>`` string.
    """
    # Pillow registers its encoder under "JPEG" only, so the "JPG" alias must be
    # translated before it reaches save(); previously it was forwarded verbatim.
    encoder = fmt.upper()
    if encoder == "JPG":
        encoder = "JPEG"

    save_kwargs = {"format": encoder}
    if encoder == "JPEG":
        save_kwargs.update({"quality": quality, "optimize": True})

    buffer = io.BytesIO()
    image.save(buffer, **save_kwargs)
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    # Derive the MIME subtype from the encoder instead of labelling every
    # non-JPEG payload as PNG.
    return f"data:image/{encoder.lower()};base64,{encoded}"
def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(2, 5, 2)):
    """Return a new RGB Pillow image of *size* filled with *color*."""
    from PIL import Image as pil_image

    blank = pil_image.new("RGB", size, color)
    return blank
def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
    """Convert *image* to RGB and crop/resize it to exactly *size*, centred."""
    from PIL import ImageOps as image_ops

    rgb_image = image.convert("RGB")
    # method=3 is Pillow's bicubic resampling filter.
    fitted = image_ops.fit(rgb_image, size, method=3, centering=(0.5, 0.5))
    return fitted
def read_video_frame(video_path: Path, frame_index: int = 2400):
    """Decode one frame of *video_path* and return it as an RGB Pillow image.

    Args:
        video_path: Video file to open with OpenCV.
        frame_index: Zero-based frame to read. It is clamped to the last frame
            when the container reports a frame count, and never goes below 0.

    Returns:
        The decoded frame as a Pillow image in RGB channel order.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be read.
    """
    import cv2
    from PIL import Image

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        # Always clamp at zero; clamp at the end only when the count is known.
        frame_index = max(0, frame_index)
        if total:
            frame_index = min(frame_index, total - 1)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        ok, frame = cap.read()
    finally:
        # Release the capture handle on every path, including the error paths.
        cap.release()

    if not ok or frame is None:
        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
    # OpenCV decodes to BGR; Pillow expects RGB.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(frame)
def draw_label(draw, xy, text, fill=(244, 248, 239), size=18):
    """Draw *text* at *xy* on a Pillow ``ImageDraw`` object.

    Args:
        draw: Drawing context; only its ``text`` method is used.
        xy: Top-left anchor of the text.
        text: String to render.
        fill: Text colour.
        size: Requested font size in pixels.

    The macOS Arial Bold system font is preferred. When it cannot be loaded,
    Pillow's built-in default font is used at the requested size where the
    installed Pillow supports that, and at its fixed size otherwise.
    """
    from PIL import ImageFont

    try:
        font = ImageFont.truetype("/System/Library/Fonts/Supplemental/Arial Bold.ttf", size)
    except Exception:
        # Best effort: the font path only exists on macOS. Try the scalable
        # default font so labels keep their size on other platforms.
        try:
            font = ImageFont.load_default(size=size)
        except Exception:
            # Older Pillow (no ``size`` argument) or no FreeType support:
            # fall back to the fixed-size bitmap font as before.
            font = ImageFont.load_default()
    draw.text(xy, text, fill=fill, font=font)
def video_thumb(sample_dir: Path) -> str:
    """Build the side-by-side fisheye / stereo thumbnail as a JPEG data URI.

    Frame 2450 of ``fisheye_cam0.mp4`` fills the left panel. The right panel
    shows frame 2450 of ``stereo_left.mp4`` when that file exists, otherwise a
    copy of the fisheye panel.
    """
    from PIL import ImageDraw

    gap = 18
    half_width = (THUMB_WIDTH - gap) // 2
    panel_size = (half_width, THUMB_HEIGHT)
    right_x = half_width + gap

    left_panel = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", 2450), panel_size)
    stereo_file = sample_dir / "stereo_left.mp4"
    if stereo_file.exists():
        right_panel = fit_image(read_video_frame(stereo_file, 2450), panel_size)
    else:
        right_panel = left_panel.copy()

    sheet = make_canvas()
    sheet.paste(left_panel, (0, 0))
    sheet.paste(right_panel, (right_x, 0))

    pen = ImageDraw.Draw(sheet, "RGBA")
    # Dark strip over the gutter, overlapping each panel edge by 4 px.
    pen.rounded_rectangle((half_width - 4, 0, right_x + 4, THUMB_HEIGHT), radius=0, fill=(2, 5, 2, 220))
    white = (255, 255, 255)
    draw_label(pen, (18, 20), "fisheye", fill=white, size=22)
    draw_label(pen, (right_x + 18, 20), "stereo", fill=white, size=22)
    return image_data_uri(sheet, "JPEG")
def colorize(values):
    """Map values in ``[0, 1]`` onto the infographic's five-stop colour ramp.

    Args:
        values: Array-like of numbers. Values outside ``[0, 1]`` are clamped;
            NaN is treated as 0 (the darkest stop).

    Returns:
        A ``uint8`` array with the input's shape plus a trailing RGB axis of
        length 3. Colours are linearly interpolated between neighbouring stops.
    """
    import numpy as np

    stops = np.array([
        [2, 5, 2],
        [58, 136, 102],
        [122, 229, 195],
        [167, 240, 120],
        [216, 244, 165],
    ], dtype=np.float32)
    # NaN passes through np.clip unchanged and then becomes an out-of-range
    # index after floor/astype(int), so neutralise it before clamping.
    x = np.clip(np.nan_to_num(np.asarray(values), nan=0.0), 0, 1)
    scaled = x * (len(stops) - 1)
    lo = np.floor(scaled).astype(int)
    # At x == 1 the upper neighbour would fall off the table; pin it to the last stop.
    hi = np.clip(lo + 1, 0, len(stops) - 1)
    frac = scaled - lo
    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
    return rgb.astype("uint8")
def depth_thumb(h5) -> str:
    """Build the depth / confidence thumbnail as a JPEG data URI.

    Args:
        h5: Open HDF5-like mapping providing ``depth/depth`` and
            ``depth/confidence`` datasets; index 2450 of each is rendered.

    The depth frame is stretched between its 3rd and 97th percentile (computed
    over finite pixels only) and colourised. Non-finite pixels are drawn with
    the darkest ramp colour. The confidence frame is shown as greyscale.
    """
    import numpy as np
    from PIL import Image, ImageDraw

    gutter = 18
    panel_width = (THUMB_WIDTH - gutter) // 2

    frame = np.array(h5["depth/depth"][2450], dtype=np.float32)
    valid = np.isfinite(frame)
    if valid.any():
        lo, hi = np.percentile(frame[valid], [3, 97])
    else:
        # No usable depth at all: use any finite range so the panel renders dark.
        lo, hi = 0.0, 1.0
    # Replace invalid pixels before normalising so NaN/inf never reach colorize().
    safe = np.where(valid, frame, lo)
    norm = (safe - lo) / max(hi - lo, 1e-6)
    norm = np.where(valid, norm, 0.0)
    rgb = colorize(norm)
    depth = fit_image(Image.fromarray(rgb), (panel_width, THUMB_HEIGHT))

    conf = np.array(h5["depth/confidence"][2450], dtype=np.uint8)
    conf_img = Image.fromarray(conf, mode="L").convert("RGB")
    conf_img = fit_image(conf_img, (panel_width, THUMB_HEIGHT))

    canvas = make_canvas()
    canvas.paste(depth, (0, 0))
    canvas.paste(conf_img, (panel_width + gutter, 0))

    draw = ImageDraw.Draw(canvas, "RGBA")
    # Semi-transparent plates behind the two captions.
    draw.rounded_rectangle((0, 0, 158, 44), radius=8, fill=(2, 5, 2, 178))
    draw.rounded_rectangle((panel_width + gutter, 0, panel_width + gutter + 220, 44), radius=8, fill=(2, 5, 2, 178))
    draw_label(draw, (14, 11), "depth", fill=(255, 255, 255), size=22)
    draw_label(draw, (panel_width + gutter + 14, 11), "confidence", fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")
def audio_thumb(sample_dir: Path) -> str:
    """Build the audio waveform thumbnail as a PNG data URI.

    Six seconds of audio starting at 45 s are decoded from
    ``fisheye_cam0.mp4`` with ``ffmpeg`` (mono, 16 kHz, signed 16-bit). The
    samples are split into 220 bins; per-bin RMS is drawn as vertical bars and
    the per-bin mean as a polyline.

    This is best effort: if decoding or drawing fails for any reason, a
    warning is printed and a fixed placeholder bar pattern is drawn instead.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    try:
        raw = subprocess.run(
            [
                "ffmpeg",
                "-v",
                "error",
                "-ss",
                "45",
                "-t",
                "6",
                "-i",
                str(sample_dir / "fisheye_cam0.mp4"),
                "-ac",
                "1",
                "-ar",
                "16000",
                "-f",
                "s16le",
                "pipe:1",
            ],
            check=True,
            stdout=subprocess.PIPE,
        ).stdout
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        if len(samples) == 0:
            raise RuntimeError("empty audio stream")
        # Peak-normalise; the floor of 1.0 avoids dividing by zero on silence.
        samples = samples / max(float(np.max(np.abs(samples))), 1.0)
        bins = 220
        # Drop the tail so the bins are equally sized whenever there are enough samples.
        trimmed = samples[: bins * max(1, len(samples) // bins)]
        chunks = np.array_split(trimmed, bins)
        rms = np.array([np.sqrt(np.mean(chunk * chunk)) if len(chunk) else 0.0 for chunk in chunks])
        waveform = np.array([float(np.mean(chunk)) if len(chunk) else 0.0 for chunk in chunks])
        baseline = THUMB_HEIGHT - 72
        for i, value in enumerate(rms):
            x = 18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36)
            h = 14 + np.clip(value * 158, 0, 158)
            draw.line((x, baseline, x, baseline - h), fill=(167, 240, 120, 170), width=2)
        points = []
        for i, value in enumerate(waveform):
            x = 18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36)
            y = 126 - np.clip(value, -1, 1) * 82
            points.append((x, y))
        draw.line(points, fill=(122, 229, 195, 220), width=2)
    except Exception as exc:
        # Keep the deliberate fallback, but report why the real waveform is missing.
        print(f"Warning: could not render audio waveform, using placeholder bars: {exc}")
        for i in range(48):
            x = 22 + i * 8
            h = 16 + (i % 7) * 7
            draw.rounded_rectangle((x, THUMB_HEIGHT - 72 - h, x + 4, THUMB_HEIGHT - 72), radius=2, fill=(167, 240, 120, 170))
    draw_label(draw, (18, 18), "Audio waveform", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def normalize_points(points, width, height, pad=16):
    """Scale the first two columns of *points* into a padded pixel rectangle.

    Each column is stretched between its 2nd and 98th percentile, clamped to
    that range, and mapped to ``[pad, width - pad]`` (column 0) and
    ``[pad, height - pad]`` (column 1). Column 1 is flipped so larger input
    values land nearer the top. Returns a new ``(n, 2)`` array.
    """
    import numpy as np

    planar = points[:, :2].copy()
    lower, upper = (np.percentile(planar, q, axis=0) for q in (2, 98))
    # Floor the extent so a degenerate (constant) column does not divide by zero.
    extent = np.maximum(upper - lower, 1e-6)
    unit = np.clip((planar - lower) / extent, 0, 1)
    unit[:, 1] = 1 - unit[:, 1]

    placed = np.empty_like(unit)
    for axis, inner in enumerate((width - pad * 2, height - pad * 2)):
        placed[:, axis] = pad + unit[:, axis] * inner
    return placed
def slam_thumb(h5) -> str:
    """Build the SLAM point-cloud / trajectory thumbnail as a PNG data URI.

    Reads ``slam/point_cloud`` and ``slam/trans_xyz`` from *h5*. Rows with
    non-finite values are dropped from the cloud, which is then thinned to at
    most 2600 evenly spaced rows. Columns 0 and 2 give the plotted position
    and column 1 drives the dot colour. Every 36th trajectory sample among the
    first 2450 is joined into a polyline.
    """
    import numpy as np
    from PIL import ImageDraw

    sheet = make_canvas()
    pen = ImageDraw.Draw(sheet, "RGBA")

    cloud = np.array(h5["slam/point_cloud"], dtype=np.float64)
    finite_rows = np.isfinite(cloud).all(axis=1)
    cloud = cloud[finite_rows]
    max_points = 2600
    if len(cloud) > max_points:
        keep = np.linspace(0, len(cloud) - 1, max_points).astype(int)
        cloud = cloud[keep]

    cloud_xy = normalize_points(cloud[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    color_axis = cloud[:, 1]
    axis_lo = np.percentile(color_axis, 2)
    axis_hi = np.percentile(color_axis, 98)
    shades = colorize((color_axis - axis_lo) / max(axis_hi - axis_lo, 1e-6))
    for (px, py), shade in zip(cloud_xy, shades):
        pen.ellipse((px - 1.2, py - 1.2, px + 1.2, py + 1.2), fill=tuple(shade.tolist()) + (165,))

    # NOTE(review): the trajectory is normalised on its own percentiles, not the
    # cloud's, so the two overlays do not share one coordinate frame - confirm intended.
    path = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
    path_xy = normalize_points(path[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    for start, end in zip(path_xy[:-1], path_xy[1:]):
        pen.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 205), width=2)

    draw_label(pen, (18, 18), "camera pose + SLAM map", fill=(244, 248, 239), size=22)
    return image_data_uri(sheet, "PNG")
def imu_thumb(h5) -> str:
    """Build the accelerometer / gyroscope trace thumbnail as a PNG data URI.

    The IMU sample index stored at ``imu/keyframe_indices[2450]`` is used as
    the centre of a window of up to 220 samples on either side. The three
    accelerometer and three gyroscope columns are each stretched between their
    own 3rd and 97th percentile and drawn as overlaid polylines.
    """
    import numpy as np
    from PIL import ImageDraw

    sheet = make_canvas()
    pen = ImageDraw.Draw(sheet, "RGBA")

    center = int(h5["imu/keyframe_indices"][2450])
    window = slice(max(0, center - 220), center + 220)
    accel = np.array(h5["imu/accel_xyz"][window], dtype=np.float64)
    gyro = np.array(h5["imu/gyro_xyz"][window], dtype=np.float64)
    traces = [accel[:, axis] for axis in range(3)] + [gyro[:, axis] for axis in range(3)]
    palette = [(167, 240, 120), (122, 229, 195), (155, 223, 255), (216, 244, 165), (244, 248, 239), (165, 175, 162)]

    # Six faint horizontal guide lines.
    for guide_y in (68 + row * 44 for row in range(6)):
        pen.line((18, guide_y, THUMB_WIDTH - 18, guide_y), fill=(167, 240, 120, 48), width=1)

    for trace, rgb in zip(traces, palette):
        trace = trace[:420]
        count = len(trace)
        if count < 2:
            continue
        low, high = np.percentile(trace, [3, 97])
        scaled = (trace - low) / max(high - low, 1e-6)
        polyline = [
            (
                18 + i / max(count - 1, 1) * (THUMB_WIDTH - 36),
                THUMB_HEIGHT - 48 - np.clip(level, 0, 1) * (THUMB_HEIGHT - 116),
            )
            for i, level in enumerate(scaled)
        ]
        pen.line(polyline, fill=rgb + (200,), width=2)

    draw_label(pen, (18, 18), "inertial accel / gyro", fill=(244, 248, 239), size=22)
    return image_data_uri(sheet, "PNG")
def mocap_thumb(h5) -> str:
    """Build the body + hand motion-capture thumbnail as a PNG data URI.

    Index 2450 of ``full_body_mocap/keypoints``, ``hand_mocap/left_joints_3d``
    and ``hand_mocap/right_joints_3d`` is read from *h5*. All three point sets
    share one normalisation (2nd-98th percentile of their first two columns).
    Each set is then placed in its own horizontal band of the canvas. Body
    keypoints are joined in storage order; hand joints follow ``HAND_EDGES``.
    """
    import numpy as np
    from PIL import ImageDraw

    sheet = make_canvas()
    pen = ImageDraw.Draw(sheet, "RGBA")

    frame = 2450
    body = np.array(h5["full_body_mocap/keypoints"][frame], dtype=np.float32)
    left = np.array(h5["hand_mocap/left_joints_3d"][frame], dtype=np.float32)
    right = np.array(h5["hand_mocap/right_joints_3d"][frame], dtype=np.float32)

    stacked_xy = np.concatenate([body, left, right], axis=0)[:, :2]
    lo = np.percentile(stacked_xy, 2, axis=0)
    hi = np.percentile(stacked_xy, 98, axis=0)
    span = np.maximum(hi - lo, 1e-6)

    def project(points, x_offset, width):
        # Shared normalisation, flipped vertically, placed in a band starting
        # at x_offset and *width* pixels wide.
        unit = (points[:, :2] - lo) / span
        screen = np.empty_like(unit)
        screen[:, 0] = x_offset + unit[:, 0] * width
        screen[:, 1] = 72 + (1 - unit[:, 1]) * (THUMB_HEIGHT - 136)
        return screen

    dot = 2.4
    body_xy = project(body, 28, 270)
    for px, py in body_xy:
        pen.ellipse((px - dot, py - dot, px + dot, py + dot), fill=(167, 240, 120, 185))
    for start, end in zip(body_xy[:-1], body_xy[1:]):
        pen.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 82), width=1)

    hands = [(left, 392, (122, 229, 195)), (right, 562, (216, 244, 165))]
    for joints, x_offset, rgb in hands:
        joints_xy = project(joints, x_offset, 126)
        for src, dst in HAND_EDGES:
            pen.line((joints_xy[src][0], joints_xy[src][1], joints_xy[dst][0], joints_xy[dst][1]), fill=rgb + (180,), width=2)
        for px, py in joints_xy:
            pen.ellipse((px - dot, py - dot, px + dot, py + dot), fill=rgb + (220,))

    draw_label(pen, (18, 18), "body + hand mocap", fill=(244, 248, 239), size=22)
    return image_data_uri(sheet, "PNG")
def text_thumb(h5) -> str:
    """Build the language-annotation thumbnail as a PNG data URI.

    The ``caption`` entry of *h5* is parsed as JSON (bytes are decoded as
    UTF-8 first). From its first segment, up to five distinct object names
    (sorted) are drawn as chips on the left, and up to two ``Current Action``
    labels as boxes on the right, each cut to 66 characters plus ``...``.
    """
    from PIL import ImageDraw

    width = THUMB_WIDTH

    payload = h5["caption"][()]
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8", errors="replace")
    first_segment = json.loads(payload)["segments"][0]

    unique_objects = set()
    for names in first_segment.get("objects", {}).values():
        unique_objects.update(names)
    object_labels = sorted(unique_objects)[:5]
    action_labels = [entry.get("label", "") for entry in first_segment.get("Current Action", [])][:2]

    sheet = make_canvas((width, THUMB_HEIGHT))
    pen = ImageDraw.Draw(sheet, "RGBA")
    draw_label(pen, (28, 24), "language annotation", fill=(244, 248, 239), size=28)

    # Object chips: one per row, width estimated from the label length.
    chip_y = 82
    for name in object_labels:
        chip_right = 28 + 52 + len(name) * 16
        pen.rounded_rectangle((28, chip_y, chip_right, chip_y + 38), radius=8, fill=(7, 18, 7, 235), outline=(167, 240, 120, 170), width=2)
        draw_label(pen, (44, chip_y + 8), name, fill=(244, 248, 239), size=18)
        chip_y += 47

    # Action boxes, stacked in the right-hand column.
    box_x = 340
    box_y = 92
    for sentence in action_labels:
        suffix = "..." if len(sentence) > 66 else ""
        shown = sentence[:66] + suffix
        pen.rounded_rectangle((box_x, box_y, width - 28, box_y + 54), radius=9, fill=(7, 18, 7, 235), outline=(122, 229, 195, 180), width=2)
        draw_label(pen, (box_x + 22, box_y + 15), shown, fill=(244, 248, 239), size=20)
        box_y += 68

    return image_data_uri(sheet, "PNG")
def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]:
    """Render every modality thumbnail for the sample episode in *sample_dir*.

    Returns a dict from modality name (as used in ``MODALITIES``) to an image
    data URI. The result is all-or-nothing: an empty dict is returned when the
    directory, ``fisheye_cam0.mp4`` or ``annotation.hdf5`` is missing, or when
    any thumbnail builder (or the ``h5py`` import) raises. In the latter case
    a warning is printed.
    """
    if sample_dir is None or not sample_dir.exists():
        return {}

    hdf5_path = sample_dir / "annotation.hdf5"
    video_path = sample_dir / "fisheye_cam0.mp4"
    if not (video_path.exists() and hdf5_path.exists()):
        return {}

    try:
        import h5py

        thumbnails = {"video": video_thumb(sample_dir), "audio": audio_thumb(sample_dir)}
        hdf5_builders = (
            ("depth", depth_thumb),
            ("pose / SLAM", slam_thumb),
            ("motion capture", mocap_thumb),
            ("inertial", imu_thumb),
            ("language", text_thumb),
        )
        with h5py.File(hdf5_path, "r") as h5:
            for modality, build in hdf5_builders:
                thumbnails[modality] = build(h5)
    except Exception as exc:
        print(f"Warning: could not build sample modality thumbnails: {exc}")
        return {}
    return thumbnails
def valid_sample_dir(sample_dir: Path | None) -> bool:
    """Return True when *sample_dir* holds both files the thumbnails require."""
    if sample_dir is None:
        return False
    required_names = ("annotation.hdf5", "fisheye_cam0.mp4")
    return all((sample_dir / name).exists() for name in required_names)
def resolve_sample_dir(sample_dir: Path | None) -> Path | None:
    """Pick the first usable sample directory.

    Candidates are tried in this order: ``$XPERIENCE10M_SAMPLE_DIR``,
    ``$WORKSPACE/data/sample/xperience-10m-sample``, the *sample_dir*
    argument, ``DEFAULT_SAMPLE_DIR``, ``DROPBOX_SAMPLE_DIR``. The first one
    accepted by ``valid_sample_dir`` is returned. If none qualifies,
    *sample_dir* is returned unchanged.
    """
    env_override = os.environ.get("XPERIENCE10M_SAMPLE_DIR")
    workspace_root = os.environ.get("WORKSPACE")

    search_order: list[Path] = []
    if env_override:
        search_order.append(Path(env_override).expanduser())
    if workspace_root:
        search_order.append(Path(workspace_root).expanduser() / "data/sample/xperience-10m-sample")
    if sample_dir is not None:
        search_order.append(sample_dir)
    search_order += [DEFAULT_SAMPLE_DIR, DROPBOX_SAMPLE_DIR]

    return next((path for path in search_order if valid_sample_dir(path)), sample_dir)
def load_summary() -> dict:
    """Parse and return the JSON document stored at ``SUMMARY_PATH``."""
    with SUMMARY_PATH.open(encoding="utf-8") as handle:
        return json.load(handle)
def fmt(value: float) -> str:
    """Format *value* (coerced to float) with exactly four decimal places."""
    return format(float(value), ".4f")
def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Return the ``(display label, formatted value)`` headline metric of a task.

    Tasks listed in the table below always use their dedicated metric key.
    Any other task uses ``macro_f1`` if present, otherwise ``accuracy``.

    Raises:
        KeyError: If a dedicated key is missing from *metrics*, or no fallback
            metric is available for *task_name*.
    """
    dedicated = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in dedicated:
        label, key = dedicated[task_name]
        return label, fmt(metrics[key])

    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return label, fmt(metrics[key])

    raise KeyError(f"No main metric configured for {task_name}")
def short_io(task_name: str, metrics: dict) -> str:
    """Return the one-line "input -> output" blurb shown on a task card.

    Known tasks use a hand-written description. Any other task falls back to
    ``metrics["input"]``, or an empty string when that key is absent.
    """
    # Looked up first, as in the original call, so *metrics* must be dict-like
    # even when the task has a hand-written description.
    fallback = metrics.get("input", "")
    descriptions = dict(
        timeline_action="all featurized modalities -> action label",
        timeline_subtask="all featurized modalities -> subtask label",
        transition_detection="all featurized modalities -> boundary vs steady",
        next_action="window at t -> action at t+20 frames",
        hand_trajectory_forecast="all featurized modalities -> future hand joints",
        contact_prediction="non-contact modalities -> contact state",
        object_relevance="non-caption feature blocks -> relevant objects",
        caption_grounding="text query -> matching sensor window",
        cross_modal_retrieval="motion / IMU / camera -> depth / video match",
        modality_reconstruction="motion / IMU / camera -> depth / video vector",
        temporal_order="two adjacent windows -> correct order",
        misalignment_detection="motion + visual pair -> aligned or shifted",
    )
    if task_name in descriptions:
        return descriptions[task_name]
    return fallback
def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int, neural_metrics: dict | None = None) -> str:
    """Render one task card as an HTML ``<article>`` fragment.

    The card shows the two-digit *index*, the *kind* badge, the task name, its
    input/output blurb and the headline metric of the minimal baseline. A
    second "NN" metric chip is added when *neural_metrics* is non-empty and
    has no ``"error"`` key. ``group["color"]`` and ``group["soft"]`` become
    the card's ``--accent`` / ``--soft`` CSS variables.
    """
    label, value = metric_for(task_name, metrics)

    has_neural_result = bool(neural_metrics) and "error" not in neural_metrics
    if has_neural_result:
        neural_label, neural_value = metric_for(task_name, neural_metrics)
        neural_html = f"""
<div class="metric neural">
<span>NN {html.escape(neural_label)}</span>
<strong>{html.escape(neural_value)}</strong>
</div>
"""
    else:
        neural_html = ""

    # Named io_summary so the local does not shadow the imported ``io`` module.
    io_summary = short_io(task_name, metrics)
    return f"""
<article class="task-card" style="--accent:{group['color']};--soft:{group['soft']};">
<div class="task-meta">
<span class="index">{index:02d}</span>
<span class="kind">{html.escape(kind)}</span>
</div>
<h3>{html.escape(task_name)}</h3>
<p>{html.escape(io_summary)}</p>
<div class="metric">
<span>min {html.escape(label)}</span>
<strong>{html.escape(value)}</strong>
</div>
{neural_html}
</article>
"""
def modality_card(name: str, modality_type: str, sample_text: str, feature_text: str, index: int, thumbnail: str | None) -> str:
    """Render one modality card as an HTML ``<article>`` fragment.

    All text arguments are HTML-escaped. *thumbnail* is inserted verbatim as
    the ``src`` of an ``<img>``; when it is empty or ``None`` the thumbnail
    container is omitted entirely.
    """
    thumb_html = f'<div class="modality-thumb"><img src="{thumbnail}" alt=""></div>' if thumbnail else ""
    safe_name = html.escape(name)
    safe_type = html.escape(modality_type)
    safe_sample = html.escape(sample_text)
    safe_feature = html.escape(feature_text)
    return f"""
<article class="modality">
<div class="modality-heading">
<div>
<span class="modality-index">{index:02d}</span>
<h3>{safe_name}</h3>
</div>
<span class="modality-type">{safe_type}</span>
</div>
{thumb_html}
<div class="modality-copy">
<div class="modality-row">
<span>Sample contains</span>
<p>{safe_sample}</p>
</div>
<div class="modality-row">
<span>Current baseline use</span>
<p>{safe_feature}</p>
</div>
</div>
</article>
"""
def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str:
    """Assemble the complete infographic as one self-contained HTML document.

    Args:
        summary: Parsed summary report. Reads ``"tasks"``, ``"num_frames"``,
            ``"num_windows"`` and ``"feature_dim"``; ``"neural_tasks"`` is
            optional and defaults to an empty mapping.
        base_image: Optional background image. It is referenced by ``file://``
            URI only when the path exists; otherwise no background layer is
            emitted.
        sample_dir: Sample episode directory passed to
            ``load_sample_thumbnails``; modalities without a thumbnail are
            rendered without an image.

    Returns:
        The HTML text, with all CSS inlined in a ``<style>`` element.

    Raises:
        KeyError: If a task named in ``GROUPS`` is missing from
            ``summary["tasks"]`` or a required summary key is absent.
    """
    suite = summary["tasks"]
    neural_suite = summary.get("neural_tasks", {})
    thumbnails = load_sample_thumbnails(sample_dir)
    base_layer = ""
    if base_image is not None and base_image.exists():
        base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>'
    # Header statistics as (value, caption) pairs.
    stats = [
        (f"{summary['num_frames']:,}", "frames"),
        (f"{summary['num_windows']:,}", "windows"),
        (f"{summary['feature_dim']:,}", "features"),
        (f"{len(suite)}+{len(neural_suite)}", "min + NN tasks"),
        ("70/30", "chronological split"),
    ]
    stats_html = "".join(
        f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>"
        for value, label in stats
    )
    # One card per modality, numbered from 1; the modality name is the thumbnail key.
    modalities_html = "".join(
        modality_card(name, modality_type, sample_text, feature_text, index, thumbnails.get(name))
        for index, (name, modality_type, sample_text, feature_text) in enumerate(MODALITIES, start=1)
    )
    # Task cards are numbered consecutively across all families.
    task_index = 1
    families = []
    for group in GROUPS:
        cards = []
        for task_name, kind in group["tasks"]:
            cards.append(task_card(task_name, kind, suite[task_name], group, task_index, neural_suite.get(task_name)))
            task_index += 1
        families.append(
            f"""
<section class="family" style="--accent:{group['color']};--soft:{group['soft']};">
<div class="family-head">
<span>{html.escape(group['tone'])}</span>
<h2>{html.escape(group['name'])}</h2>
</div>
<div class="family-cards">{''.join(cards)}</div>
</section>
"""
        )
    # The template is an f-string: literal CSS braces are doubled ({{ }}), and
    # single-brace fields interpolate the canvas size and the fragments built above.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1">
<title>Xperience-10M 12-Task Episode Suite Infographic</title>
<style>
* {{ box-sizing: border-box; }}
html,
body {{
margin: 0;
width: {CANVAS_WIDTH}px;
height: {CANVAS_HEIGHT}px;
background: #020502;
}}
body {{
font-family: "Inter Tight", "Space Grotesk", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
color: #f4f8ef;
text-rendering: optimizeLegibility;
}}
.canvas {{
position: relative;
width: {CANVAS_WIDTH}px;
height: {CANVAS_HEIGHT}px;
overflow: hidden;
padding: 54px 64px 44px;
background:
radial-gradient(circle at 72% 10%, rgba(167,240,120,0.18), transparent 24%),
radial-gradient(circle at 20% 28%, rgba(255,255,255,0.10) 1px, transparent 2px),
#020502;
background-size: auto, 18px 18px, auto;
}}
.image-background {{
position: absolute;
inset: 0;
background-position: center;
background-repeat: no-repeat;
background-size: cover;
opacity: 0.36;
filter: saturate(1.05) contrast(1.08) brightness(0.42);
}}
.content {{
position: relative;
z-index: 1;
}}
.header {{
display: grid;
grid-template-columns: 1.25fr 0.75fr;
gap: 44px;
align-items: end;
padding-bottom: 30px;
border-bottom: 1px solid rgba(167,240,120,0.20);
}}
.kicker {{
display: inline-flex;
align-items: center;
gap: 12px;
color: #ccffa0;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 15px;
text-transform: uppercase;
letter-spacing: 0.08em;
}}
.kicker::before {{
content: "";
width: 44px;
height: 1px;
background: #ccffa0;
}}
h1 {{
margin: 18px 0 0;
max-width: 930px;
font-size: 72px;
line-height: 0.95;
letter-spacing: 0;
}}
.subtitle {{
margin: 18px 0 0;
max-width: 900px;
color: #dce8d7;
font-size: 23px;
line-height: 1.35;
font-weight: 520;
}}
.stats {{
display: grid;
grid-template-columns: repeat(5, minmax(0, 1fr));
gap: 10px;
}}
.stat {{
min-height: 78px;
padding: 14px 15px;
border: 1px solid rgba(167,240,120,0.24);
background: rgba(7,18,7,0.80);
border-radius: 8px;
}}
.stat strong {{
display: block;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 25px;
line-height: 1;
font-variant-numeric: tabular-nums;
}}
.stat span {{
display: block;
margin-top: 8px;
color: #a5afa2;
font-size: 13px;
line-height: 1.15;
}}
.section-label {{
display: grid;
grid-template-columns: 1fr;
gap: 12px;
align-items: start;
margin: 44px 0 24px;
color: #a5afa2;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 22px;
text-transform: uppercase;
letter-spacing: 0.08em;
}}
.section-label span:last-child {{
max-width: 1400px;
color: #dce8d7;
text-transform: none;
letter-spacing: 0;
font-family: inherit;
font-size: 21px;
line-height: 1.42;
text-align: left;
}}
.modalities {{
display: grid;
grid-template-columns: 1fr;
gap: 34px;
}}
.modality {{
min-height: 560px;
padding: 34px;
border: 1px solid rgba(167,240,120,0.22);
background: rgba(7,18,7,0.84);
border-radius: 8px;
display: grid;
grid-template-columns: 880px minmax(0, 1fr);
grid-template-areas:
"thumb heading"
"thumb copy";
column-gap: 46px;
row-gap: 28px;
align-items: start;
}}
.modality-thumb {{
grid-area: thumb;
height: 492px;
overflow: hidden;
border: 1px solid rgba(167,240,120,0.16);
border-radius: 8px;
background: #020502;
}}
.modality-thumb img {{
display: block;
width: 100%;
height: 100%;
object-fit: cover;
}}
.modality-index,
.index {{
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-variant-numeric: tabular-nums;
}}
.modality-heading {{
grid-area: heading;
display: flex;
align-items: start;
justify-content: space-between;
gap: 24px;
padding-bottom: 26px;
border-bottom: 1px solid rgba(167,240,120,0.16);
}}
.modality-index {{
color: #a5afa2;
font-size: 24px;
}}
.modality-type {{
color: #ccffa0;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 16px;
line-height: 1.15;
text-transform: uppercase;
letter-spacing: 0.08em;
text-align: right;
max-width: 330px;
padding-top: 8px;
}}
.modality h3 {{
margin: 14px 0 0;
font-size: 76px;
line-height: 0.98;
text-transform: uppercase;
}}
.modality-copy {{
grid-area: copy;
display: grid;
grid-template-columns: 1fr;
gap: 22px;
}}
.modality-row {{
display: grid;
grid-template-columns: 1fr;
gap: 10px;
align-items: baseline;
padding: 22px 24px;
border: 1px solid rgba(167,240,120,0.16);
border-radius: 8px;
background: rgba(2,5,2,0.40);
}}
.modality-row span {{
display: block;
color: #a5afa2;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 16px;
letter-spacing: 0.06em;
line-height: 1.25;
text-transform: uppercase;
}}
.modality-row p {{
margin: 0;
color: #dce8d7;
font-size: 40px;
font-weight: 650;
line-height: 1.15;
}}
.shared-band {{
display: grid;
grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr;
gap: 12px;
align-items: center;
margin-top: 30px;
padding: 14px;
border: 1px solid rgba(167,240,120,0.22);
background: rgba(7,18,7,0.72);
border-radius: 8px;
}}
.step {{
min-height: 62px;
padding: 13px 15px;
background: rgba(7,18,7,0.92);
border: 1px solid rgba(167,240,120,0.16);
border-radius: 8px;
}}
.step strong {{
display: block;
font-size: 17px;
line-height: 1.1;
}}
.step span {{
display: block;
margin-top: 5px;
color: #a5afa2;
font-size: 13px;
}}
.arrow {{
color: #ccffa0;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 22px;
}}
.families {{
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 24px;
margin-top: 30px;
}}
.family {{
padding: 20px;
border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
background: rgba(7,18,7,0.82);
border-radius: 8px;
}}
.family-head {{
display: flex;
align-items: end;
justify-content: space-between;
gap: 16px;
min-height: 66px;
padding-bottom: 16px;
border-bottom: 1px solid color-mix(in srgb, var(--accent) 24%, #020502);
}}
.family-head span {{
color: var(--accent);
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 12px;
text-transform: uppercase;
letter-spacing: 0.08em;
}}
.family-head h2 {{
margin: 0;
color: var(--accent);
font-size: 32px;
line-height: 1.02;
text-align: right;
}}
.family-cards {{
display: grid;
gap: 16px;
margin-top: 18px;
}}
.task-card {{
min-height: 178px;
padding: 18px 20px;
border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
background: linear-gradient(180deg, rgba(10,24,10,0.96), color-mix(in srgb, var(--soft) 24%, #071207));
border-radius: 8px;
}}
.task-meta {{
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
}}
.index {{
color: #a5afa2;
font-size: 12px;
}}
.kind {{
display: inline-flex;
align-items: center;
height: 24px;
padding: 0 9px;
border-radius: 6px;
border: 1px solid color-mix(in srgb, var(--accent) 40%, #020502);
color: var(--accent);
background: rgba(2,5,2,0.48);
text-transform: uppercase;
font-size: 11px;
line-height: 1;
font-weight: 830;
}}
.task-card h3 {{
margin: 12px 0 0;
color: #f4f8ef;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 21px;
line-height: 1.18;
overflow-wrap: anywhere;
}}
.task-card p {{
margin: 11px 0 0;
min-height: 39px;
color: #dce8d7;
font-size: 15px;
line-height: 1.28;
font-weight: 560;
}}
.metric {{
display: inline-flex;
align-items: baseline;
gap: 10px;
margin-top: 10px;
min-height: 32px;
padding: 7px 10px;
border-radius: 8px;
border: 1px solid color-mix(in srgb, var(--accent) 42%, #020502);
background: rgba(2,5,2,0.42);
}}
.metric.neural {{
margin-left: 8px;
border-color: rgba(255,255,255,0.20);
background: rgba(255,255,255,0.08);
}}
.metric span {{
color: #a5afa2;
font-size: 13px;
font-weight: 760;
}}
.metric strong {{
color: var(--accent);
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 20px;
line-height: 1;
font-weight: 860;
font-variant-numeric: tabular-nums;
}}
.footer {{
display: flex;
align-items: center;
justify-content: space-between;
gap: 32px;
margin-top: 22px;
padding-top: 20px;
border-top: 1px solid rgba(167,240,120,0.20);
color: #a5afa2;
font-size: 18px;
line-height: 1.35;
font-weight: 620;
}}
.footer code {{
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
color: #020502;
background: #ccffa0;
border: 1px solid #ccffa0;
border-radius: 7px;
padding: 6px 9px;
white-space: nowrap;
}}
</style>
</head>
<body>
<main class="canvas" aria-label="Ropedia Xperience-10M 12-task suite infographic">
{base_layer}
<div class="content">
<header class="header">
<div>
<div class="kicker">verified single-episode task suite</div>
<h1>Ropedia Xperience-10M 12-task suite</h1>
<p class="subtitle">A clean map from synchronized multimodal windows to 12 research task heads, comparing minimal heads with neural MLP results. Next milestone: Qwen3-Omni fine-tuning with sensor-bridge evaluation.</p>
</div>
<div class="stats">{stats_html}</div>
</header>
<section class="shared-band" aria-label="shared processing contract">
<div class="step"><strong>raw public episode</strong><span>video, audio, depth, pose, mocap, IMU, language</span></div>
<div class="arrow">-></div>
<div class="step"><strong>20-frame windows</strong><span>stride 5, chronological order</span></div>
<div class="arrow">-></div>
<div class="step"><strong>{summary['feature_dim']:,}-d vector</strong><span>current manifest includes audio features</span></div>
<div class="arrow">-></div>
<div class="step"><strong>12 minimal + NN heads</strong><span>softmax/ridge/logistic plus PyTorch MLP</span></div>
</section>
<div class="section-label">
<span>12 task families</span>
<span>Every task below has a minimal baseline and a neural MLP head over the same aligned window contract, making the suite easy to compare, extend, and scale to held-out episodes.</span>
</div>
<section class="families">{''.join(families)}</section>
<div class="section-label">
<span>Xperience-10M modalities</span>
<span>Public-sample thumbnails are enlarged here so each data stream is legible. Audio is present in the sample MP4 stream and is now extracted into the current baseline manifest.</span>
</div>
<section class="modalities">{modalities_html}</section>
<footer class="footer">
<span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span>
<code>results/episode_task_suite/summary_report.json</code>
</footer>
</div>
</main>
</body>
</html>
"""
def render_html(html_path: Path, output_path: Path) -> None:
    """Screenshot *html_path* to *output_path* with the Playwright CLI.

    The output directory is created if needed. ``npx --yes playwright
    screenshot`` is run with a full-page capture at the canvas viewport size.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    viewport_flag = f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}"
    page_uri = html_path.resolve().as_uri()
    command = ["npx", "--yes", "playwright", "screenshot", "--full-page", viewport_flag, page_uri, str(output_path)]
    subprocess.run(command, check=True)
def main() -> int:
    """Command-line entry point: build the HTML and, unless disabled, the PNG.

    The HTML is written to ``--html`` when given, otherwise to a temporary
    ``.html`` file that is kept on disk. Returns 0 on success.
    """
    parser = argparse.ArgumentParser()
    path_options = (
        ("--base-image", DEFAULT_BASE),
        ("--sample-dir", DEFAULT_SAMPLE_DIR),
        ("--output", DEFAULT_OUTPUT),
    )
    for flag, default in path_options:
        parser.add_argument(flag, type=Path, default=default)
    parser.add_argument("--html", type=Path)
    parser.add_argument("--no-export", action="store_true", help="Only write the HTML used to render the image.")
    args = parser.parse_args()

    summary = load_summary()
    sample_dir = resolve_sample_dir(args.sample_dir)
    html_text = build_html(summary, args.base_image, sample_dir)

    if args.html is not None:
        html_path = args.html
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")
    else:
        # delete=False: the file must outlive this block so the renderer can open it.
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
            handle.write(html_text)
            html_path = Path(handle.name)

    export_image = not args.no_export
    if export_image:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote render HTML: {html_path}")
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())