| |
| """ |
| Render a polished Ropedia Xperience-10M 12-task infographic. |
| |
| The task names, inputs, and metrics are read from |
| results/episode_task_suite/summary_report.json. The output is a deterministic |
| PNG rendered from HTML/CSS so the labels stay legible and inspectable. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import base64 |
| import html |
| import io |
| import json |
| import os |
| import subprocess |
| import tempfile |
| from pathlib import Path |
|
|
|
|
# Base directory for project-relative paths: the parent of the directory
# that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# JSON report read by load_summary(); supplies task names and metrics.
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
# Sample-episode locations tried last by resolve_sample_dir().
DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
# NOTE(review): this path layout looks macOS-specific (~/Library/CloudStorage) — confirm.
DROPBOX_SAMPLE_DIR = Path.home() / "Library/CloudStorage/Dropbox/Ropedia/data/sample/xperience-10m-sample"
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
# Pixel dimensions interpolated into the HTML/CSS page template.
CANVAS_WIDTH = 1800
CANVAS_HEIGHT = 6600
# Pixel dimensions of each modality thumbnail canvas.
THUMB_WIDTH = 880
THUMB_HEIGHT = 520
|
|
|
|
# Task families rendered as sections of the infographic, in display order.
# Per entry:
#   name  - heading text of the family section
#   tone  - short label rendered as text next to the heading
#   color - value of the CSS custom property --accent
#   soft  - value of the CSS custom property --soft
#   tasks - (task_name, kind) pairs; task_name is the key looked up in the
#           summary report's "tasks" mapping, kind is shown as a badge
# NOTE(review): the tone strings do not appear to describe the hex colors
# (e.g. "blue" is paired with a light green) — confirm this is intended.
GROUPS = [
    {
        "name": "Label + State",
        "tone": "teal",
        "color": "#9bdfff",
        "soft": "#071d20",
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "tone": "blue",
        "color": "#ccffa0",
        "soft": "#10210a",
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "tone": "amber",
        "color": "#7ae5c3",
        "soft": "#092019",
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "tone": "red",
        "color": "#d8f4a5",
        "soft": "#1b210d",
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]
|
|
# One row per modality card, in display order:
#   (name, modality_type, sample_text, feature_text)
# `name` doubles as the lookup key into the thumbnail dict returned by
# load_sample_thumbnails(), so the two must stay in sync.
MODALITIES = [
    ("video", "visual stream", "6 synchronized camera MP4 streams", "RGB/fisheye/stereo frame statistics"),
    ("audio", "acoustic stream", "audio stream embedded in MP4", "audio feature group"),
    ("depth", "geometry map", "depth map + confidence channel", "spatial geometry feature block"),
    ("pose / SLAM", "camera pose", "trajectory + sparse SLAM map", "position + orientation features"),
    ("motion capture", "human motion", "body + hand joint tracks", "3D mocap feature statistics"),
    ("inertial", "wearable sensor", "accelerometer + gyroscope", "wearable motion statistics"),
    ("language", "semantic annotation", "object tags + action captions", "task labels + semantic targets"),
]
|
|
# Pairs of joint indices that mocap_thumb() connects with line segments when
# drawing a hand. Five chains of four segments each, all starting at index 0;
# the highest index used is 20.
# NOTE(review): presumably a 21-joint hand layout with the wrist at 0 — confirm.
HAND_EDGES = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
]
|
|
|
|
def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
    """Encode *image* as a base64 ``data:`` URI.

    Args:
        image: Object exposing a Pillow-style ``save(fp, **kwargs)`` method.
        fmt: Image format name, case-insensitive. ``"JPG"`` is accepted as an
            alias for ``"JPEG"``.
        quality: JPEG quality; only forwarded to ``save`` for JPEG output.

    Returns:
        ``data:image/jpeg;base64,...`` for JPEG output, otherwise
        ``data:image/png;base64,...``.
    """
    # Pillow registers its JPEG encoder under "JPEG" only, so the "JPG" alias
    # has to be normalized before it reaches save(format=...).
    normalized = fmt.upper()
    if normalized == "JPG":
        normalized = "JPEG"
    is_jpeg = normalized == "JPEG"

    save_kwargs = {"format": normalized}
    if is_jpeg:
        save_kwargs.update({"quality": quality, "optimize": True})

    buffer = io.BytesIO()
    image.save(buffer, **save_kwargs)
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    mime = "jpeg" if is_jpeg else "png"
    return f"data:image/{mime};base64,{encoded}"
|
|
|
|
def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(2, 5, 2)):
    """Return a new RGB Pillow image of *size* filled with *color*.

    Defaults to the thumbnail dimensions and the near-black page background.
    """
    import PIL.Image

    blank = PIL.Image.new("RGB", size, color)
    return blank
|
|
|
|
def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
    """Convert *image* to RGB and scale/crop it to exactly *size*, centered."""
    from PIL import ImageOps

    rgb_image = image.convert("RGB")
    # NOTE(review): 3 appears to be the integer value of Pillow's BICUBIC
    # resampling filter — confirm against the installed Pillow version.
    resample_filter = 3
    return ImageOps.fit(rgb_image, size, method=resample_filter, centering=(0.5, 0.5))
|
|
|
|
def read_video_frame(video_path: Path, frame_index: int = 2400):
    """Read one frame from a video file and return it as an RGB Pillow image.

    Args:
        video_path: Path of the video to open with OpenCV.
        frame_index: Zero-based frame to read. When the container reports a
            frame count, the index is clamped into ``[0, count - 1]``.

    Returns:
        The decoded frame as a ``PIL.Image.Image`` in RGB channel order.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be read.
    """
    import cv2
    from PIL import Image

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        if total:
            frame_index = max(0, min(frame_index, total - 1))
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        ok, frame = cap.read()
    finally:
        # Release the capture on every path, including the error paths above.
        cap.release()
    if not ok:
        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
    # OpenCV decodes to BGR; Pillow expects RGB.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
|
|
|
def draw_label(draw, xy, text, fill=(244, 248, 239), size=18):
    """Draw *text* at *xy* on *draw* using a bold font when one is available.

    Tries a fixed system font path at the requested *size*; if loading fails
    for any reason, Pillow's built-in default font is used instead (best
    effort, so a missing font never aborts rendering).
    """
    from PIL import ImageFont

    bold_font_path = "/System/Library/Fonts/Supplemental/Arial Bold.ttf"
    try:
        typeface = ImageFont.truetype(bold_font_path, size)
    except Exception:
        typeface = ImageFont.load_default()
    draw.text(xy, text, fill=fill, font=typeface)
|
|
|
|
def video_thumb(sample_dir: Path) -> str:
    """Build the side-by-side fisheye / stereo thumbnail as a JPEG data URI.

    The left panel is a frame from ``fisheye_cam0.mp4``. The right panel is
    the same frame index from ``stereo_left.mp4`` when that file exists,
    otherwise a copy of the fisheye panel.
    """
    from PIL import Image, ImageDraw

    frame_index = 2450
    gutter = 18
    panel_width = (THUMB_WIDTH - gutter) // 2
    panel_size = (panel_width, THUMB_HEIGHT)
    right_x = panel_width + gutter

    fish = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", frame_index), panel_size)
    stereo_path = sample_dir / "stereo_left.mp4"
    if stereo_path.exists():
        stereo = fit_image(read_video_frame(stereo_path, frame_index), panel_size)
    else:
        stereo = fish.copy()

    canvas = make_canvas()
    canvas.paste(fish, (0, 0))
    canvas.paste(stereo, (right_x, 0))

    draw = ImageDraw.Draw(canvas, "RGBA")
    # Dark strip over the gutter, overlapping each panel edge by 4 px.
    divider_box = (panel_width - 4, 0, right_x + 4, THUMB_HEIGHT)
    draw.rounded_rectangle(divider_box, radius=0, fill=(2, 5, 2, 220))

    white = (255, 255, 255)
    draw_label(draw, (18, 20), "fisheye", fill=white, size=22)
    draw_label(draw, (right_x + 18, 20), "stereo", fill=white, size=22)
    return image_data_uri(canvas, "JPEG")
|
|
|
|
def colorize(values):
    """Map values in ``[0, 1]`` to RGB colors along a five-stop gradient.

    Args:
        values: Scalar or array-like of numbers. Values outside ``[0, 1]`` are
            clamped; NaN is treated as 0 (the darkest stop).

    Returns:
        A ``uint8`` array with a trailing axis of length 3 (R, G, B), linearly
        interpolated between neighbouring color stops.
    """
    import numpy as np

    stops = np.array([
        [2, 5, 2],
        [58, 136, 102],
        [122, 229, 195],
        [167, 240, 120],
        [216, 244, 165],
    ], dtype=np.float32)
    last = len(stops) - 1
    # np.clip leaves NaN untouched, and NaN cast to int is not a valid stop
    # index, so replace it before computing the segment indices.
    x = np.nan_to_num(np.clip(values, 0, 1), nan=0.0)
    scaled = x * last
    lo = np.floor(scaled).astype(int)
    hi = np.minimum(lo + 1, last)
    frac = scaled - lo
    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
    return rgb.astype("uint8")
|
|
|
|
def depth_thumb(h5) -> str:
    """Build the depth / confidence thumbnail as a JPEG data URI.

    Reads one frame from the ``depth/depth`` and ``depth/confidence`` datasets
    of *h5*, colorizes the depth frame after a 3rd-97th percentile stretch,
    and pastes both side by side with labels.

    Args:
        h5: Open mapping-like HDF5 handle providing the two datasets above.

    Returns:
        The rendered thumbnail as a ``data:image/jpeg;base64,...`` string.
    """
    import numpy as np
    from PIL import Image, ImageDraw

    frame_index = 2450
    gutter = 18
    panel_width = (THUMB_WIDTH - gutter) // 2
    panel_size = (panel_width, THUMB_HEIGHT)
    right_x = panel_width + gutter

    frame = np.array(h5["depth/depth"][frame_index], dtype=np.float32)
    valid = np.isfinite(frame)
    if valid.any():
        lo, hi = np.percentile(frame[valid], [3, 97])
        norm = (frame - lo) / max(hi - lo, 1e-6)
        # NaN pixels cannot be indexed into the colormap; show them as the
        # darkest color. Infinities keep saturating to the nearest end.
        norm = np.nan_to_num(norm, nan=0.0, posinf=1.0, neginf=0.0)
    else:
        # No finite depth at all: there is no range to stretch, render black.
        norm = np.zeros_like(frame)
    rgb = colorize(norm)
    depth = fit_image(Image.fromarray(rgb), panel_size)

    # NOTE(review): the uint8 cast assumes the stored confidence is already
    # on a 0-255 scale — confirm against the dataset.
    conf = np.array(h5["depth/confidence"][frame_index], dtype=np.uint8)
    conf_img = Image.fromarray(conf, mode="L").convert("RGB")
    conf_img = fit_image(conf_img, panel_size)

    canvas = make_canvas()
    canvas.paste(depth, (0, 0))
    canvas.paste(conf_img, (right_x, 0))
    draw = ImageDraw.Draw(canvas, "RGBA")
    # Translucent plates behind the two labels keep them readable.
    draw.rounded_rectangle((0, 0, 158, 44), radius=8, fill=(2, 5, 2, 178))
    draw.rounded_rectangle((right_x, 0, right_x + 220, 44), radius=8, fill=(2, 5, 2, 178))
    draw_label(draw, (14, 11), "depth", fill=(255, 255, 255), size=22)
    draw_label(draw, (right_x + 14, 11), "confidence", fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")
|
|
|
|
def audio_thumb(sample_dir: Path) -> str:
    """Build the audio thumbnail as a PNG data URI.

    Runs ``ffmpeg`` to decode 6 seconds of mono 16 kHz PCM starting at 45 s
    from ``fisheye_cam0.mp4``, then draws per-bin RMS bars and a mean-value
    trace. If anything in that pipeline fails, a warning is printed and a
    fixed placeholder bar pattern is drawn instead, so a missing ffmpeg or
    audio track never aborts the render.

    Args:
        sample_dir: Directory containing ``fisheye_cam0.mp4``.

    Returns:
        The rendered thumbnail as a ``data:image/png;base64,...`` string.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    baseline = THUMB_HEIGHT - 72
    try:
        command = [
            "ffmpeg",
            "-v",
            "error",
            "-ss",
            "45",
            "-t",
            "6",
            "-i",
            str(sample_dir / "fisheye_cam0.mp4"),
            "-ac",
            "1",
            "-ar",
            "16000",
            "-f",
            "s16le",
            "pipe:1",
        ]
        raw = subprocess.run(command, check=True, stdout=subprocess.PIPE).stdout
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        if len(samples) == 0:
            raise RuntimeError("empty audio stream")
        # Peak-normalize; the floor of 1.0 avoids dividing by zero on silence.
        samples = samples / max(float(np.max(np.abs(samples))), 1.0)
        bins = 220
        trimmed = samples[: bins * max(1, len(samples) // bins)]
        chunks = np.array_split(trimmed, bins)
        rms = np.array([np.sqrt(np.mean(chunk * chunk)) if len(chunk) else 0.0 for chunk in chunks])
        waveform = np.array([float(np.mean(chunk)) if len(chunk) else 0.0 for chunk in chunks])
        # Both plots share the same horizontal position per bin.
        xs = [18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36) for i in range(bins)]
        for x, value in zip(xs, rms):
            h = 14 + np.clip(value * 158, 0, 158)
            draw.line((x, baseline, x, baseline - h), fill=(167, 240, 120, 170), width=2)
        points = [(x, 126 - np.clip(value, -1, 1) * 82) for x, value in zip(xs, waveform)]
        draw.line(points, fill=(122, 229, 195, 220), width=2)
    except Exception as exc:
        # Deliberate best-effort fallback, but report why the real waveform
        # is missing instead of failing silently.
        print(f"Warning: could not render audio waveform, drawing placeholder bars: {exc}")
        for i in range(48):
            x = 22 + i * 8
            h = 16 + (i % 7) * 7
            draw.rounded_rectangle((x, baseline - h, x + 4, baseline), radius=2, fill=(167, 240, 120, 170))
    draw_label(draw, (18, 18), "Audio waveform", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
|
|
|
|
def normalize_points(points, width, height, pad=16):
    """Scale the first two columns of *points* into a padded pixel rectangle.

    Each axis is stretched so its 2nd-98th percentile range spans the area
    between ``pad`` and ``extent - pad``; values outside that range are
    clamped to the edges. The second axis is flipped so larger input values
    land nearer the top. The input array is not modified.

    Returns:
        An ``(N, 2)`` array of pixel coordinates.
    """
    import numpy as np

    coords = points[:, :2].copy()
    lower = np.percentile(coords, 2, axis=0)
    upper = np.percentile(coords, 98, axis=0)
    extent = np.maximum(upper - lower, 1e-6)  # guard against a zero-width range
    unit = np.clip((coords - lower) / extent, 0, 1)
    unit[:, 1] = 1 - unit[:, 1]

    pixels = np.empty_like(unit)
    for axis, full_size in enumerate((width, height)):
        pixels[:, axis] = pad + unit[:, axis] * (full_size - pad * 2)
    return pixels
|
|
|
|
def slam_thumb(h5) -> str:
    """Build the SLAM point-cloud / trajectory thumbnail as a PNG data URI.

    Plots up to 2600 evenly sampled finite rows of ``slam/point_cloud`` using
    columns 0 and 2 as the plane, shaded by column 1, then overlays a
    polyline through every 36th entry of the first 2450 rows of
    ``slam/trans_xyz``.
    """
    import numpy as np
    from PIL import ImageDraw

    max_points = 2600
    dot_radius = 1.2
    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    cloud = np.array(h5["slam/point_cloud"], dtype=np.float64)
    finite_rows = np.isfinite(cloud).all(axis=1)
    cloud = cloud[finite_rows]
    if len(cloud) > max_points:
        keep = np.linspace(0, len(cloud) - 1, max_points).astype(int)
        cloud = cloud[keep]

    cloud_xy = normalize_points(cloud[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    shade = cloud[:, 1]
    shade_lo = np.percentile(shade, 2)
    shade_hi = np.percentile(shade, 98)
    shade_norm = (shade - shade_lo) / max(shade_hi - shade_lo, 1e-6)
    for (x, y), rgb in zip(cloud_xy, colorize(shade_norm)):
        box = (x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius)
        draw.ellipse(box, fill=tuple(rgb.tolist()) + (165,))

    # NOTE(review): the trajectory is normalized on its own percentiles, so it
    # does not share a scale with the point cloud above — confirm intended.
    traj = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
    path_xy = normalize_points(traj[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    for start, end in zip(path_xy[:-1], path_xy[1:]):
        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 205), width=2)

    draw_label(draw, (18, 18), "camera pose + SLAM map", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
|
|
|
|
def imu_thumb(h5) -> str:
    """Build the accelerometer / gyroscope thumbnail as a PNG data URI.

    Takes the IMU sample index stored at position 2450 of
    ``imu/keyframe_indices``, reads a window of up to 220 samples on either
    side from ``imu/accel_xyz`` and ``imu/gyro_xyz``, and plots the first 420
    samples of each of the six axes, each stretched by its own 3rd-97th
    percentile range.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    key_idx = int(h5["imu/keyframe_indices"][2450])
    window = slice(max(0, key_idx - 220), key_idx + 220)
    accel = np.array(h5["imu/accel_xyz"][window], dtype=np.float64)
    gyro = np.array(h5["imu/gyro_xyz"][window], dtype=np.float64)
    series = [sensor[:, axis] for sensor in (accel, gyro) for axis in range(3)]
    colors = [(167, 240, 120), (122, 229, 195), (155, 223, 255), (216, 244, 165), (244, 248, 239), (165, 175, 162)]

    # Faint horizontal guide lines behind the traces.
    for guide_y in (68 + row * 44 for row in range(6)):
        draw.line((18, guide_y, THUMB_WIDTH - 18, guide_y), fill=(167, 240, 120, 48), width=1)

    plot_width = THUMB_WIDTH - 36
    plot_height = THUMB_HEIGHT - 116
    for trace, color in zip(series, colors):
        trace = trace[:420]
        count = len(trace)
        if count < 2:
            continue
        lo, hi = np.percentile(trace, [3, 97])
        norm = (trace - lo) / max(hi - lo, 1e-6)
        pts = [
            (18 + i / max(count - 1, 1) * plot_width, THUMB_HEIGHT - 48 - np.clip(v, 0, 1) * plot_height)
            for i, v in enumerate(norm)
        ]
        draw.line(pts, fill=color + (200,), width=2)

    draw_label(draw, (18, 18), "inertial accel / gyro", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
|
|
|
|
def mocap_thumb(h5) -> str:
    """Build the body + hands motion-capture thumbnail as a PNG data URI.

    Reads entry 2450 of the body keypoints and of the left/right hand joints,
    scales all three with one shared 2nd-98th percentile range over their
    first two coordinates, and draws the body at the left followed by the two
    hands. Body keypoints are joined in storage order; hand joints are joined
    according to ``HAND_EDGES``.
    """
    import numpy as np
    from PIL import ImageDraw

    frame_index = 2450
    dot = 2.4
    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    body = np.array(h5["full_body_mocap/keypoints"][frame_index], dtype=np.float32)
    left = np.array(h5["hand_mocap/left_joints_3d"][frame_index], dtype=np.float32)
    right = np.array(h5["hand_mocap/right_joints_3d"][frame_index], dtype=np.float32)

    # One shared range keeps body and hands on the same scale.
    everything = np.concatenate([body, left, right], axis=0)
    lo = np.percentile(everything[:, :2], 2, axis=0)
    hi = np.percentile(everything[:, :2], 98, axis=0)
    span = np.maximum(hi - lo, 1e-6)

    def project(points, x_offset, width):
        """Map *points* into a column starting at *x_offset*, *width* px wide."""
        unit = (points[:, :2] - lo) / span
        screen = np.empty_like(unit)
        screen[:, 0] = x_offset + unit[:, 0] * width
        screen[:, 1] = 72 + (1 - unit[:, 1]) * (THUMB_HEIGHT - 136)
        return screen

    body_xy = project(body, 28, 270)
    for x, y in body_xy:
        draw.ellipse((x - dot, y - dot, x + dot, y + dot), fill=(167, 240, 120, 185))
    for start, end in zip(body_xy[:-1], body_xy[1:]):
        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 82), width=1)

    hands = (
        (left, 392, (122, 229, 195)),
        (right, 562, (216, 244, 165)),
    )
    for joints, x_offset, color in hands:
        hand_xy = project(joints, x_offset, 126)
        for a, b in HAND_EDGES:
            draw.line((hand_xy[a][0], hand_xy[a][1], hand_xy[b][0], hand_xy[b][1]), fill=color + (180,), width=2)
        for x, y in hand_xy:
            draw.ellipse((x - dot, y - dot, x + dot, y + dot), fill=color + (220,))

    draw_label(draw, (18, 18), "body + hand mocap", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
|
|
|
|
def text_thumb(h5) -> str:
    """Build the language-annotation thumbnail as a PNG data URI.

    Parses the JSON stored in the ``caption`` dataset, takes its first
    segment, and draws up to five object tags (sorted, de-duplicated) as
    chips on the left and up to two action labels as boxes on the right.
    Action labels longer than 66 characters are cut and suffixed with "...".
    """
    from PIL import ImageDraw

    width = THUMB_WIDTH
    text_color = (244, 248, 239)
    box_fill = (7, 18, 7, 235)

    caption_blob = h5["caption"][()]
    if isinstance(caption_blob, bytes):
        caption_blob = caption_blob.decode("utf-8", errors="replace")
    segment = json.loads(caption_blob)["segments"][0]

    unique_objects = set()
    for values in segment.get("objects", {}).values():
        unique_objects.update(values)
    objects = sorted(unique_objects)[:5]
    action_labels = [entry.get("label", "") for entry in segment.get("Current Action", [])]
    actions = action_labels[:2]

    canvas = make_canvas((width, THUMB_HEIGHT))
    draw = ImageDraw.Draw(canvas, "RGBA")
    draw_label(draw, (28, 24), "language annotation", fill=text_color, size=28)

    for row, label in enumerate(objects):
        top = 82 + row * 47
        # Chip width grows with label length (16 px per character).
        chip_width = 52 + len(label) * 16
        draw.rounded_rectangle((28, top, 28 + chip_width, top + 38), radius=8, fill=box_fill, outline=(167, 240, 120, 170), width=2)
        draw_label(draw, (44, top + 8), label, fill=text_color, size=18)

    left = 340
    for row, action in enumerate(actions):
        top = 92 + row * 68
        shown = action[:66]
        if len(action) > 66:
            shown += "..."
        draw.rounded_rectangle((left, top, width - 28, top + 54), radius=9, fill=box_fill, outline=(122, 229, 195, 180), width=2)
        draw_label(draw, (left + 22, top + 15), shown, fill=text_color, size=20)
    return image_data_uri(canvas, "PNG")
|
|
|
|
| def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]: |
| if sample_dir is None or not sample_dir.exists(): |
| return {} |
| hdf5_path = sample_dir / "annotation.hdf5" |
| required = [sample_dir / "fisheye_cam0.mp4", hdf5_path] |
| if not all(path.exists() for path in required): |
| return {} |
| try: |
| import h5py |
|
|
| thumbnails = {"video": video_thumb(sample_dir), "audio": audio_thumb(sample_dir)} |
| with h5py.File(hdf5_path, "r") as h5: |
| thumbnails.update({ |
| "depth": depth_thumb(h5), |
| "pose / SLAM": slam_thumb(h5), |
| "motion capture": mocap_thumb(h5), |
| "inertial": imu_thumb(h5), |
| "language": text_thumb(h5), |
| }) |
| return thumbnails |
| except Exception as exc: |
| print(f"Warning: could not build sample modality thumbnails: {exc}") |
| return {} |
|
|
|
|
| def valid_sample_dir(sample_dir: Path | None) -> bool: |
| if sample_dir is None: |
| return False |
| return (sample_dir / "annotation.hdf5").exists() and (sample_dir / "fisheye_cam0.mp4").exists() |
|
|
|
|
def resolve_sample_dir(sample_dir: Path | None) -> Path | None:
    """Pick the first usable sample directory from a fixed list of candidates.

    Candidates are checked in this order: the ``XPERIENCE10M_SAMPLE_DIR``
    environment variable, ``$WORKSPACE/data/sample/xperience-10m-sample``,
    the *sample_dir* argument, ``DEFAULT_SAMPLE_DIR``, and
    ``DROPBOX_SAMPLE_DIR``. Unset or empty environment variables and a
    ``None`` argument are skipped.

    Returns:
        The first candidate accepted by ``valid_sample_dir``; if none is
        accepted, *sample_dir* unchanged (possibly ``None``).
    """
    candidates: list[Path] = []

    explicit_env = os.environ.get("XPERIENCE10M_SAMPLE_DIR")
    if explicit_env:
        candidates.append(Path(explicit_env).expanduser())

    workspace_env = os.environ.get("WORKSPACE")
    if workspace_env:
        candidates.append(Path(workspace_env).expanduser() / "data/sample/xperience-10m-sample")

    if sample_dir is not None:
        candidates.append(sample_dir)

    candidates += [DEFAULT_SAMPLE_DIR, DROPBOX_SAMPLE_DIR]

    return next((path for path in candidates if valid_sample_dir(path)), sample_dir)
|
|
|
|
def load_summary() -> dict:
    """Read and parse the UTF-8 JSON summary report at ``SUMMARY_PATH``."""
    with SUMMARY_PATH.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
def fmt(value: float) -> str:
    """Format *value* as a fixed-point string with four decimal places.

    The value is passed through ``float()`` first, so numeric strings and
    integers are accepted as well.
    """
    number = float(value)
    return format(number, ".4f")
|
|
|
|
# Tasks with a dedicated headline metric: task name -> (display label, metrics key).
_TASK_HEADLINE_METRICS = {
    "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
    "cross_modal_retrieval": ("top-5", "top5_accuracy"),
    "caption_grounding": ("MRR", "mrr"),
    "object_relevance": ("micro-F1", "micro_f1"),
    "modality_reconstruction": ("R2", "r2"),
    "temporal_order": ("F1", "f1"),
    "misalignment_detection": ("F1", "f1"),
}
# For every other task, the first of these keys present in the metrics wins.
_FALLBACK_HEADLINE_METRICS = (
    ("macro-F1", "macro_f1"),
    ("accuracy", "accuracy"),
)


def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Return the ``(label, formatted value)`` of a task's headline metric.

    Tasks with a dedicated metric use it unconditionally. Any other task
    falls back to ``macro_f1`` and then ``accuracy``, whichever is present.

    Raises:
        KeyError: If a dedicated metric key is missing from *metrics*, or if
            no fallback key is present for a task without a dedicated metric.
    """
    if task_name in _TASK_HEADLINE_METRICS:
        label, key = _TASK_HEADLINE_METRICS[task_name]
        return label, fmt(metrics[key])
    for label, key in _FALLBACK_HEADLINE_METRICS:
        if key in metrics:
            return label, fmt(metrics[key])
    raise KeyError(f"No main metric configured for {task_name}")
|
|
|
|
# One-line "input -> output" summary shown on each task card.
_TASK_IO_SUMMARIES = {
    "timeline_action": "all featurized modalities -> action label",
    "timeline_subtask": "all featurized modalities -> subtask label",
    "transition_detection": "all featurized modalities -> boundary vs steady",
    "next_action": "window at t -> action at t+20 frames",
    "hand_trajectory_forecast": "all featurized modalities -> future hand joints",
    "contact_prediction": "non-contact modalities -> contact state",
    "object_relevance": "non-caption feature blocks -> relevant objects",
    "caption_grounding": "text query -> matching sensor window",
    "cross_modal_retrieval": "motion / IMU / camera -> depth / video match",
    "modality_reconstruction": "motion / IMU / camera -> depth / video vector",
    "temporal_order": "two adjacent windows -> correct order",
    "misalignment_detection": "motion + visual pair -> aligned or shifted",
}


def short_io(task_name: str, metrics: dict) -> str:
    """Return the short input/output description for *task_name*.

    Known tasks use a hand-written summary. Unknown tasks fall back to the
    ``"input"`` entry of *metrics*, or an empty string if that is absent.
    """
    fallback = metrics.get("input", "")
    return _TASK_IO_SUMMARIES.get(task_name, fallback)
|
|
|
|
def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int, neural_metrics: dict | None = None) -> str:
    """Render one task card as an HTML ``<article>`` fragment.

    Args:
        task_name: Task identifier; shown as the card title.
        kind: Short category label shown as a badge.
        metrics: Metrics of the minimal baseline for this task.
        group: Family entry supplying the ``color`` and ``soft`` CSS values.
        index: Card number, rendered zero-padded to two digits.
        neural_metrics: Optional neural-head metrics. A second metric pill is
            added only when this is non-empty and has no ``"error"`` key.

    Returns:
        The HTML fragment, with all text content HTML-escaped.
    """
    label, value = metric_for(task_name, metrics)

    neural_html = ""
    if neural_metrics and "error" not in neural_metrics:
        nn_label, nn_value = metric_for(task_name, neural_metrics)
        safe_nn_label = html.escape(nn_label)
        safe_nn_value = html.escape(nn_value)
        neural_html = f"""
<div class="metric neural">
<span>NN {safe_nn_label}</span>
<strong>{safe_nn_value}</strong>
</div>
"""

    io_summary = short_io(task_name, metrics)
    accent = group['color']
    soft = group['soft']
    safe_kind = html.escape(kind)
    safe_title = html.escape(task_name)
    safe_summary = html.escape(io_summary)
    safe_label = html.escape(label)
    safe_value = html.escape(value)
    return f"""
<article class="task-card" style="--accent:{accent};--soft:{soft};">
<div class="task-meta">
<span class="index">{index:02d}</span>
<span class="kind">{safe_kind}</span>
</div>
<h3>{safe_title}</h3>
<p>{safe_summary}</p>
<div class="metric">
<span>min {safe_label}</span>
<strong>{safe_value}</strong>
</div>
{neural_html}
</article>
"""
|
|
|
|
| def modality_card(name: str, modality_type: str, sample_text: str, feature_text: str, index: int, thumbnail: str | None) -> str: |
| thumb_html = "" |
| if thumbnail: |
| thumb_html = f'<div class="modality-thumb"><img src="{thumbnail}" alt=""></div>' |
| return f""" |
| <article class="modality"> |
| <div class="modality-heading"> |
| <div> |
| <span class="modality-index">{index:02d}</span> |
| <h3>{html.escape(name)}</h3> |
| </div> |
| <span class="modality-type">{html.escape(modality_type)}</span> |
| </div> |
| {thumb_html} |
| <div class="modality-copy"> |
| <div class="modality-row"> |
| <span>Sample contains</span> |
| <p>{html.escape(sample_text)}</p> |
| </div> |
| <div class="modality-row"> |
| <span>Current baseline use</span> |
| <p>{html.escape(feature_text)}</p> |
| </div> |
| </div> |
| </article> |
| """ |
|
|
|
|
| def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str: |
| suite = summary["tasks"] |
| neural_suite = summary.get("neural_tasks", {}) |
| thumbnails = load_sample_thumbnails(sample_dir) |
| base_layer = "" |
| if base_image is not None and base_image.exists(): |
| base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>' |
| stats = [ |
| (f"{summary['num_frames']:,}", "frames"), |
| (f"{summary['num_windows']:,}", "windows"), |
| (f"{summary['feature_dim']:,}", "features"), |
| (f"{len(suite)}+{len(neural_suite)}", "min + NN tasks"), |
| ("70/30", "chronological split"), |
| ] |
| stats_html = "".join( |
| f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>" |
| for value, label in stats |
| ) |
| modalities_html = "".join( |
| modality_card(name, modality_type, sample_text, feature_text, index, thumbnails.get(name)) |
| for index, (name, modality_type, sample_text, feature_text) in enumerate(MODALITIES, start=1) |
| ) |
|
|
| task_index = 1 |
| families = [] |
| for group in GROUPS: |
| cards = [] |
| for task_name, kind in group["tasks"]: |
| cards.append(task_card(task_name, kind, suite[task_name], group, task_index, neural_suite.get(task_name))) |
| task_index += 1 |
| families.append( |
| f""" |
| <section class="family" style="--accent:{group['color']};--soft:{group['soft']};"> |
| <div class="family-head"> |
| <span>{html.escape(group['tone'])}</span> |
| <h2>{html.escape(group['name'])}</h2> |
| </div> |
| <div class="family-cards">{''.join(cards)}</div> |
| </section> |
| """ |
| ) |
|
|
| return f"""<!doctype html> |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1"> |
| <title>Xperience-10M 12-Task Episode Suite Infographic</title> |
| <style> |
| * {{ box-sizing: border-box; }} |
| html, |
| body {{ |
| margin: 0; |
| width: {CANVAS_WIDTH}px; |
| height: {CANVAS_HEIGHT}px; |
| background: #020502; |
| }} |
| body {{ |
| font-family: "Inter Tight", "Space Grotesk", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; |
| color: #f4f8ef; |
| text-rendering: optimizeLegibility; |
| }} |
| .canvas {{ |
| position: relative; |
| width: {CANVAS_WIDTH}px; |
| height: {CANVAS_HEIGHT}px; |
| overflow: hidden; |
| padding: 54px 64px 44px; |
| background: |
| radial-gradient(circle at 72% 10%, rgba(167,240,120,0.18), transparent 24%), |
| radial-gradient(circle at 20% 28%, rgba(255,255,255,0.10) 1px, transparent 2px), |
| #020502; |
| background-size: auto, 18px 18px, auto; |
| }} |
| .image-background {{ |
| position: absolute; |
| inset: 0; |
| background-position: center; |
| background-repeat: no-repeat; |
| background-size: cover; |
| opacity: 0.36; |
| filter: saturate(1.05) contrast(1.08) brightness(0.42); |
| }} |
| .content {{ |
| position: relative; |
| z-index: 1; |
| }} |
| .header {{ |
| display: grid; |
| grid-template-columns: 1.25fr 0.75fr; |
| gap: 44px; |
| align-items: end; |
| padding-bottom: 30px; |
| border-bottom: 1px solid rgba(167,240,120,0.20); |
| }} |
| .kicker {{ |
| display: inline-flex; |
| align-items: center; |
| gap: 12px; |
| color: #ccffa0; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 15px; |
| text-transform: uppercase; |
| letter-spacing: 0.08em; |
| }} |
| .kicker::before {{ |
| content: ""; |
| width: 44px; |
| height: 1px; |
| background: #ccffa0; |
| }} |
| h1 {{ |
| margin: 18px 0 0; |
| max-width: 930px; |
| font-size: 72px; |
| line-height: 0.95; |
| letter-spacing: 0; |
| }} |
| .subtitle {{ |
| margin: 18px 0 0; |
| max-width: 900px; |
| color: #dce8d7; |
| font-size: 23px; |
| line-height: 1.35; |
| font-weight: 520; |
| }} |
| .stats {{ |
| display: grid; |
| grid-template-columns: repeat(5, minmax(0, 1fr)); |
| gap: 10px; |
| }} |
| .stat {{ |
| min-height: 78px; |
| padding: 14px 15px; |
| border: 1px solid rgba(167,240,120,0.24); |
| background: rgba(7,18,7,0.80); |
| border-radius: 8px; |
| }} |
| .stat strong {{ |
| display: block; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 25px; |
| line-height: 1; |
| font-variant-numeric: tabular-nums; |
| }} |
| .stat span {{ |
| display: block; |
| margin-top: 8px; |
| color: #a5afa2; |
| font-size: 13px; |
| line-height: 1.15; |
| }} |
| .section-label {{ |
| display: grid; |
| grid-template-columns: 1fr; |
| gap: 12px; |
| align-items: start; |
| margin: 44px 0 24px; |
| color: #a5afa2; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 22px; |
| text-transform: uppercase; |
| letter-spacing: 0.08em; |
| }} |
| .section-label span:last-child {{ |
| max-width: 1400px; |
| color: #dce8d7; |
| text-transform: none; |
| letter-spacing: 0; |
| font-family: inherit; |
| font-size: 21px; |
| line-height: 1.42; |
| text-align: left; |
| }} |
| .modalities {{ |
| display: grid; |
| grid-template-columns: 1fr; |
| gap: 34px; |
| }} |
| .modality {{ |
| min-height: 560px; |
| padding: 34px; |
| border: 1px solid rgba(167,240,120,0.22); |
| background: rgba(7,18,7,0.84); |
| border-radius: 8px; |
| display: grid; |
| grid-template-columns: 880px minmax(0, 1fr); |
| grid-template-areas: |
| "thumb heading" |
| "thumb copy"; |
| column-gap: 46px; |
| row-gap: 28px; |
| align-items: start; |
| }} |
| .modality-thumb {{ |
| grid-area: thumb; |
| height: 492px; |
| overflow: hidden; |
| border: 1px solid rgba(167,240,120,0.16); |
| border-radius: 8px; |
| background: #020502; |
| }} |
| .modality-thumb img {{ |
| display: block; |
| width: 100%; |
| height: 100%; |
| object-fit: cover; |
| }} |
| .modality-index, |
| .index {{ |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-variant-numeric: tabular-nums; |
| }} |
| .modality-heading {{ |
| grid-area: heading; |
| display: flex; |
| align-items: start; |
| justify-content: space-between; |
| gap: 24px; |
| padding-bottom: 26px; |
| border-bottom: 1px solid rgba(167,240,120,0.16); |
| }} |
| .modality-index {{ |
| color: #a5afa2; |
| font-size: 24px; |
| }} |
| .modality-type {{ |
| color: #ccffa0; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 16px; |
| line-height: 1.15; |
| text-transform: uppercase; |
| letter-spacing: 0.08em; |
| text-align: right; |
| max-width: 330px; |
| padding-top: 8px; |
| }} |
| .modality h3 {{ |
| margin: 14px 0 0; |
| font-size: 76px; |
| line-height: 0.98; |
| text-transform: uppercase; |
| }} |
| .modality-copy {{ |
| grid-area: copy; |
| display: grid; |
| grid-template-columns: 1fr; |
| gap: 22px; |
| }} |
| .modality-row {{ |
| display: grid; |
| grid-template-columns: 1fr; |
| gap: 10px; |
| align-items: baseline; |
| padding: 22px 24px; |
| border: 1px solid rgba(167,240,120,0.16); |
| border-radius: 8px; |
| background: rgba(2,5,2,0.40); |
| }} |
| .modality-row span {{ |
| display: block; |
| color: #a5afa2; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 16px; |
| letter-spacing: 0.06em; |
| line-height: 1.25; |
| text-transform: uppercase; |
| }} |
| .modality-row p {{ |
| margin: 0; |
| color: #dce8d7; |
| font-size: 40px; |
| font-weight: 650; |
| line-height: 1.15; |
| }} |
| .shared-band {{ |
| display: grid; |
| grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr; |
| gap: 12px; |
| align-items: center; |
| margin-top: 30px; |
| padding: 14px; |
| border: 1px solid rgba(167,240,120,0.22); |
| background: rgba(7,18,7,0.72); |
| border-radius: 8px; |
| }} |
| .step {{ |
| min-height: 62px; |
| padding: 13px 15px; |
| background: rgba(7,18,7,0.92); |
| border: 1px solid rgba(167,240,120,0.16); |
| border-radius: 8px; |
| }} |
| .step strong {{ |
| display: block; |
| font-size: 17px; |
| line-height: 1.1; |
| }} |
| .step span {{ |
| display: block; |
| margin-top: 5px; |
| color: #a5afa2; |
| font-size: 13px; |
| }} |
| .arrow {{ |
| color: #ccffa0; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 22px; |
| }} |
| .families {{ |
| display: grid; |
| grid-template-columns: repeat(2, minmax(0, 1fr)); |
| gap: 24px; |
| margin-top: 30px; |
| }} |
| .family {{ |
| padding: 20px; |
| border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502); |
| background: rgba(7,18,7,0.82); |
| border-radius: 8px; |
| }} |
| .family-head {{ |
| display: flex; |
| align-items: end; |
| justify-content: space-between; |
| gap: 16px; |
| min-height: 66px; |
| padding-bottom: 16px; |
| border-bottom: 1px solid color-mix(in srgb, var(--accent) 24%, #020502); |
| }} |
| .family-head span {{ |
| color: var(--accent); |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 12px; |
| text-transform: uppercase; |
| letter-spacing: 0.08em; |
| }} |
| .family-head h2 {{ |
| margin: 0; |
| color: var(--accent); |
| font-size: 32px; |
| line-height: 1.02; |
| text-align: right; |
| }} |
| .family-cards {{ |
| display: grid; |
| gap: 16px; |
| margin-top: 18px; |
| }} |
| .task-card {{ |
| min-height: 178px; |
| padding: 18px 20px; |
| border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502); |
| background: linear-gradient(180deg, rgba(10,24,10,0.96), color-mix(in srgb, var(--soft) 24%, #071207)); |
| border-radius: 8px; |
| }} |
| .task-meta {{ |
| display: flex; |
| align-items: center; |
| justify-content: space-between; |
| gap: 12px; |
| }} |
| .index {{ |
| color: #a5afa2; |
| font-size: 12px; |
| }} |
| .kind {{ |
| display: inline-flex; |
| align-items: center; |
| height: 24px; |
| padding: 0 9px; |
| border-radius: 6px; |
| border: 1px solid color-mix(in srgb, var(--accent) 40%, #020502); |
| color: var(--accent); |
| background: rgba(2,5,2,0.48); |
| text-transform: uppercase; |
| font-size: 11px; |
| line-height: 1; |
| font-weight: 830; |
| }} |
| .task-card h3 {{ |
| margin: 12px 0 0; |
| color: #f4f8ef; |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 21px; |
| line-height: 1.18; |
| overflow-wrap: anywhere; |
| }} |
| .task-card p {{ |
| margin: 11px 0 0; |
| min-height: 39px; |
| color: #dce8d7; |
| font-size: 15px; |
| line-height: 1.28; |
| font-weight: 560; |
| }} |
| .metric {{ |
| display: inline-flex; |
| align-items: baseline; |
| gap: 10px; |
| margin-top: 10px; |
| min-height: 32px; |
| padding: 7px 10px; |
| border-radius: 8px; |
| border: 1px solid color-mix(in srgb, var(--accent) 42%, #020502); |
| background: rgba(2,5,2,0.42); |
| }} |
| .metric.neural {{ |
| margin-left: 8px; |
| border-color: rgba(255,255,255,0.20); |
| background: rgba(255,255,255,0.08); |
| }} |
| .metric span {{ |
| color: #a5afa2; |
| font-size: 13px; |
| font-weight: 760; |
| }} |
| .metric strong {{ |
| color: var(--accent); |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| font-size: 20px; |
| line-height: 1; |
| font-weight: 860; |
| font-variant-numeric: tabular-nums; |
| }} |
| .footer {{ |
| display: flex; |
| align-items: center; |
| justify-content: space-between; |
| gap: 32px; |
| margin-top: 22px; |
| padding-top: 20px; |
| border-top: 1px solid rgba(167,240,120,0.20); |
| color: #a5afa2; |
| font-size: 18px; |
| line-height: 1.35; |
| font-weight: 620; |
| }} |
| .footer code {{ |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; |
| color: #020502; |
| background: #ccffa0; |
| border: 1px solid #ccffa0; |
| border-radius: 7px; |
| padding: 6px 9px; |
| white-space: nowrap; |
| }} |
| </style> |
| </head> |
| <body> |
| <main class="canvas" aria-label="Ropedia Xperience-10M 12-task suite infographic"> |
| {base_layer} |
| <div class="content"> |
| <header class="header"> |
| <div> |
| <div class="kicker">verified single-episode task suite</div> |
| <h1>Ropedia Xperience-10M 12-task suite</h1> |
| <p class="subtitle">A clean map from synchronized multimodal windows to 12 research task heads, comparing minimal heads with neural MLP results. Next milestone: Qwen3-Omni fine-tuning with sensor-bridge evaluation.</p> |
| </div> |
| <div class="stats">{stats_html}</div> |
| </header> |
| |
| <section class="shared-band" aria-label="shared processing contract"> |
| <div class="step"><strong>raw public episode</strong><span>video, audio, depth, pose, mocap, IMU, language</span></div> |
| <div class="arrow">-></div> |
| <div class="step"><strong>20-frame windows</strong><span>stride 5, chronological order</span></div> |
| <div class="arrow">-></div> |
| <div class="step"><strong>{summary['feature_dim']:,}-d vector</strong><span>current manifest includes audio features</span></div> |
| <div class="arrow">-></div> |
| <div class="step"><strong>12 minimal + NN heads</strong><span>softmax/ridge/logistic plus PyTorch MLP</span></div> |
| </section> |
| |
| <div class="section-label"> |
| <span>12 task families</span> |
| <span>Every task below has a minimal baseline and a neural MLP head over the same aligned window contract, making the suite easy to compare, extend, and scale to held-out episodes.</span> |
| </div> |
| <section class="families">{''.join(families)}</section> |
| |
| <div class="section-label"> |
| <span>Xperience-10M modalities</span> |
| <span>Public-sample thumbnails are enlarged here so each data stream is legible. Audio is present in the sample MP4 stream and is now extracted into the current baseline manifest.</span> |
| </div> |
| <section class="modalities">{modalities_html}</section> |
| |
| <footer class="footer"> |
| <span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span> |
| <code>results/episode_task_suite/summary_report.json</code> |
| </footer> |
| </div> |
| </main> |
| </body> |
| </html> |
| """ |
|
|
|
|
def render_html(html_path: Path, output_path: Path) -> None:
    """Capture a full-page screenshot of ``html_path`` into ``output_path``.

    The parent directory of ``output_path`` is created when missing. The
    screenshot itself is taken by the Playwright CLI, launched through
    ``npx --yes`` with a viewport of CANVAS_WIDTH x CANVAS_HEIGHT.

    Raises:
        subprocess.CalledProcessError: if the CLI exits with a non-zero
            status (the call is made with ``check=True``).
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    viewport_option = f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}"
    # The page is handed to the CLI as a file:// URI, which needs an
    # absolute path, hence the resolve() first.
    page_uri = html_path.resolve().as_uri()

    command = [
        "npx",
        "--yes",
        "playwright",
        "screenshot",
        "--full-page",
        viewport_option,
        page_uri,
        str(output_path),
    ]
    subprocess.run(command, check=True)
|
|
|
|
def main() -> int:
    """Parse CLI options, write the render HTML, and optionally export it.

    The HTML is built from the loaded summary, the base image path, and the
    resolved sample directory. It is written to ``--html`` when given
    (creating parent directories), otherwise to a temporary ``.html`` file
    that is left on disk so its path can be reported. Unless ``--no-export``
    is set, the HTML is then rendered to ``--output``.

    Returns:
        0 once the HTML (and, if requested, the image) has been written.
    """
    parser = argparse.ArgumentParser()
    path_options = (
        ("--base-image", DEFAULT_BASE),
        ("--sample-dir", DEFAULT_SAMPLE_DIR),
        ("--output", DEFAULT_OUTPUT),
    )
    for flag, fallback in path_options:
        parser.add_argument(flag, type=Path, default=fallback)
    parser.add_argument("--html", type=Path)
    parser.add_argument(
        "--no-export",
        action="store_true",
        help="Only write the HTML used to render the image.",
    )
    args = parser.parse_args()

    summary = load_summary()
    sample_dir = resolve_sample_dir(args.sample_dir)
    html_text = build_html(summary, args.base_image, sample_dir)

    html_path = args.html
    if html_path is not None:
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")
    else:
        # delete=False keeps the file after the handle closes, so the
        # renderer can open it and the path printed below stays valid.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".html", encoding="utf-8", delete=False
        ) as handle:
            handle.write(html_text)
            html_path = Path(handle.name)

    export_requested = not args.no_export
    if export_requested:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote render HTML: {html_path}")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Run the CLI only when executed as a script; main()'s return value
    # becomes the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|