#!/usr/bin/env python3
"""
Render a polished Ropedia Xperience-10M 12-task infographic.

The task names, inputs, and metrics are read from
results/episode_task_suite/summary_report.json. The output is a deterministic
PNG rendered from HTML/CSS so the labels stay legible and inspectable.
"""

from __future__ import annotations

import argparse
import base64
import html
import io
import json
import os
import subprocess
import tempfile
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
DROPBOX_SAMPLE_DIR = Path.home() / "Library/CloudStorage/Dropbox/Ropedia/data/sample/xperience-10m-sample"
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
CANVAS_WIDTH = 1800
CANVAS_HEIGHT = 6600
THUMB_WIDTH = 880
THUMB_HEIGHT = 520


GROUPS = [
    {
        "name": "Label + State",
        "tone": "teal",
        "color": "#9bdfff",
        "soft": "#071d20",
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "tone": "blue",
        "color": "#ccffa0",
        "soft": "#10210a",
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "tone": "amber",
        "color": "#7ae5c3",
        "soft": "#092019",
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "tone": "red",
        "color": "#d8f4a5",
        "soft": "#1b210d",
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]

MODALITIES = [
    ("video", "visual stream", "6 synchronized camera MP4 streams", "RGB/fisheye/stereo frame statistics"),
    ("audio", "acoustic stream", "audio stream embedded in MP4", "audio feature group"),
    ("depth", "geometry map", "depth map + confidence channel", "spatial geometry feature block"),
    ("pose / SLAM", "camera pose", "trajectory + sparse SLAM map", "position + orientation features"),
    ("motion capture", "human motion", "body + hand joint tracks", "3D mocap feature statistics"),
    ("inertial", "wearable sensor", "accelerometer + gyroscope", "wearable motion statistics"),
    ("language", "semantic annotation", "object tags + action captions", "task labels + semantic targets"),
]

HAND_EDGES = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
]


def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
    buffer = io.BytesIO()
    save_kwargs = {"format": fmt}
    if fmt.upper() in {"JPEG", "JPG"}:
        save_kwargs.update({"quality": quality, "optimize": True})
    image.save(buffer, **save_kwargs)
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    mime = "jpeg" if fmt.upper() in {"JPEG", "JPG"} else "png"
    return f"data:image/{mime};base64,{encoded}"


def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(2, 5, 2)):
    from PIL import Image

    return Image.new("RGB", size, color)


def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
    from PIL import ImageOps

    return ImageOps.fit(image.convert("RGB"), size, method=3, centering=(0.5, 0.5))


def read_video_frame(video_path: Path, frame_index: int = 2400):
    import cv2
    from PIL import Image

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video: {video_path}")
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    if total:
        frame_index = max(0, min(frame_index, total - 1))
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    ok, frame = cap.read()
    cap.release()
    if not ok:
        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(frame)


def draw_label(draw, xy, text, fill=(244, 248, 239), size=18):
    from PIL import ImageFont

    try:
        font = ImageFont.truetype("/System/Library/Fonts/Supplemental/Arial Bold.ttf", size)
    except Exception:
        font = ImageFont.load_default()
    draw.text(xy, text, fill=fill, font=font)


def video_thumb(sample_dir: Path) -> str:
    from PIL import Image, ImageDraw

    gutter = 18
    panel_width = (THUMB_WIDTH - gutter) // 2
    fish = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", 2450), (panel_width, THUMB_HEIGHT))
    stereo_path = sample_dir / "stereo_left.mp4"
    stereo = fit_image(read_video_frame(stereo_path, 2450), (panel_width, THUMB_HEIGHT)) if stereo_path.exists() else fish.copy()
    canvas = make_canvas()
    canvas.paste(fish, (0, 0))
    canvas.paste(stereo, (panel_width + gutter, 0))
    draw = ImageDraw.Draw(canvas, "RGBA")
    draw.rounded_rectangle((panel_width - 4, 0, panel_width + gutter + 4, THUMB_HEIGHT), radius=0, fill=(2, 5, 2, 220))
    draw_label(draw, (18, 20), "fisheye", fill=(255, 255, 255), size=22)
    draw_label(draw, (panel_width + gutter + 18, 20), "stereo", fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")


def colorize(values):
    import numpy as np

    stops = np.array([
        [2, 5, 2],
        [58, 136, 102],
        [122, 229, 195],
        [167, 240, 120],
        [216, 244, 165],
    ], dtype=np.float32)
    x = np.clip(values, 0, 1)
    scaled = x * (len(stops) - 1)
    lo = np.floor(scaled).astype(int)
    hi = np.clip(lo + 1, 0, len(stops) - 1)
    frac = scaled - lo
    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
    return rgb.astype("uint8")


def depth_thumb(h5) -> str:
    import numpy as np
    from PIL import Image, ImageDraw

    gutter = 18
    panel_width = (THUMB_WIDTH - gutter) // 2
    frame = np.array(h5["depth/depth"][2450], dtype=np.float32)
    valid = np.isfinite(frame)
    lo, hi = np.percentile(frame[valid], [3, 97])
    norm = (frame - lo) / max(hi - lo, 1e-6)
    rgb = colorize(norm)
    depth = fit_image(Image.fromarray(rgb), (panel_width, THUMB_HEIGHT))
    conf = np.array(h5["depth/confidence"][2450], dtype=np.uint8)
    conf_img = Image.fromarray(conf, mode="L").convert("RGB")
    conf_img = fit_image(conf_img, (panel_width, THUMB_HEIGHT))
    canvas = make_canvas()
    canvas.paste(depth, (0, 0))
    canvas.paste(conf_img, (panel_width + gutter, 0))
    draw = ImageDraw.Draw(canvas, "RGBA")
    draw.rounded_rectangle((0, 0, 158, 44), radius=8, fill=(2, 5, 2, 178))
    draw.rounded_rectangle((panel_width + gutter, 0, panel_width + gutter + 220, 44), radius=8, fill=(2, 5, 2, 178))
    draw_label(draw, (14, 11), "depth", fill=(255, 255, 255), size=22)
    draw_label(draw, (panel_width + gutter + 14, 11), "confidence", fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")


def audio_thumb(sample_dir: Path) -> str:
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    try:
        raw = subprocess.run(
            [
                "ffmpeg",
                "-v",
                "error",
                "-ss",
                "45",
                "-t",
                "6",
                "-i",
                str(sample_dir / "fisheye_cam0.mp4"),
                "-ac",
                "1",
                "-ar",
                "16000",
                "-f",
                "s16le",
                "pipe:1",
            ],
            check=True,
            stdout=subprocess.PIPE,
        ).stdout
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        if len(samples) == 0:
            raise RuntimeError("empty audio stream")
        samples = samples / max(float(np.max(np.abs(samples))), 1.0)
        bins = 220
        trimmed = samples[: bins * max(1, len(samples) // bins)]
        chunks = np.array_split(trimmed, bins)
        rms = np.array([np.sqrt(np.mean(chunk * chunk)) if len(chunk) else 0.0 for chunk in chunks])
        waveform = np.array([float(np.mean(chunk)) if len(chunk) else 0.0 for chunk in chunks])
        baseline = THUMB_HEIGHT - 72
        for i, value in enumerate(rms):
            x = 18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36)
            h = 14 + np.clip(value * 158, 0, 158)
            draw.line((x, baseline, x, baseline - h), fill=(167, 240, 120, 170), width=2)
        points = []
        for i, value in enumerate(waveform):
            x = 18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36)
            y = 126 - np.clip(value, -1, 1) * 82
            points.append((x, y))
        draw.line(points, fill=(122, 229, 195, 220), width=2)
    except Exception:
        for i in range(48):
            x = 22 + i * 8
            h = 16 + (i % 7) * 7
            draw.rounded_rectangle((x, THUMB_HEIGHT - 72 - h, x + 4, THUMB_HEIGHT - 72), radius=2, fill=(167, 240, 120, 170))
    draw_label(draw, (18, 18), "Audio waveform", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")


def normalize_points(points, width, height, pad=16):
    import numpy as np

    xy = points[:, :2].copy()
    lo = np.percentile(xy, 2, axis=0)
    hi = np.percentile(xy, 98, axis=0)
    span = np.maximum(hi - lo, 1e-6)
    norm = (xy - lo) / span
    norm = np.clip(norm, 0, 1)
    norm[:, 1] = 1 - norm[:, 1]
    out = np.empty_like(norm)
    out[:, 0] = pad + norm[:, 0] * (width - pad * 2)
    out[:, 1] = pad + norm[:, 1] * (height - pad * 2)
    return out


def slam_thumb(h5) -> str:
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    points = np.array(h5["slam/point_cloud"], dtype=np.float64)
    points = points[np.isfinite(points).all(axis=1)]
    if len(points) > 2600:
        points = points[np.linspace(0, len(points) - 1, 2600).astype(int)]
    xy = normalize_points(points[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    z = points[:, 1]
    z_norm = (z - np.percentile(z, 2)) / max(np.percentile(z, 98) - np.percentile(z, 2), 1e-6)
    colors = colorize(z_norm)
    for (x, y), color in zip(xy, colors):
        draw.ellipse((x - 1.2, y - 1.2, x + 1.2, y + 1.2), fill=tuple(color.tolist()) + (165,))
    traj = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
    traj_xy = normalize_points(traj[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    for a, b in zip(traj_xy[:-1], traj_xy[1:]):
        draw.line((a[0], a[1], b[0], b[1]), fill=(167, 240, 120, 205), width=2)
    draw_label(draw, (18, 18), "camera pose + SLAM map", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")


def imu_thumb(h5) -> str:
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    key_idx = int(h5["imu/keyframe_indices"][2450])
    accel = np.array(h5["imu/accel_xyz"][max(0, key_idx - 220): key_idx + 220], dtype=np.float64)
    gyro = np.array(h5["imu/gyro_xyz"][max(0, key_idx - 220): key_idx + 220], dtype=np.float64)
    series = [accel[:, 0], accel[:, 1], accel[:, 2], gyro[:, 0], gyro[:, 1], gyro[:, 2]]
    colors = [(167, 240, 120), (122, 229, 195), (155, 223, 255), (216, 244, 165), (244, 248, 239), (165, 175, 162)]
    for row in range(6):
        y = 68 + row * 44
        draw.line((18, y, THUMB_WIDTH - 18, y), fill=(167, 240, 120, 48), width=1)
    for values, color in zip(series, colors):
        values = values[:420]
        if len(values) < 2:
            continue
        lo, hi = np.percentile(values, [3, 97])
        norm = (values - lo) / max(hi - lo, 1e-6)
        pts = []
        for i, v in enumerate(norm):
            x = 18 + i / max(len(values) - 1, 1) * (THUMB_WIDTH - 36)
            y = THUMB_HEIGHT - 48 - np.clip(v, 0, 1) * (THUMB_HEIGHT - 116)
            pts.append((x, y))
        draw.line(pts, fill=color + (200,), width=2)
    draw_label(draw, (18, 18), "inertial accel / gyro", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")


def mocap_thumb(h5) -> str:
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    body = np.array(h5["full_body_mocap/keypoints"][2450], dtype=np.float32)
    left = np.array(h5["hand_mocap/left_joints_3d"][2450], dtype=np.float32)
    right = np.array(h5["hand_mocap/right_joints_3d"][2450], dtype=np.float32)
    all_points = np.concatenate([body, left, right], axis=0)
    lo = np.percentile(all_points[:, :2], 2, axis=0)
    hi = np.percentile(all_points[:, :2], 98, axis=0)
    span = np.maximum(hi - lo, 1e-6)

    def project(points, x_offset, width):
        xy = (points[:, :2] - lo) / span
        xy[:, 1] = 1 - xy[:, 1]
        xy[:, 0] = x_offset + xy[:, 0] * width
        xy[:, 1] = 72 + xy[:, 1] * (THUMB_HEIGHT - 136)
        return xy

    body_xy = project(body, 28, 270)
    for x, y in body_xy:
        draw.ellipse((x - 2.4, y - 2.4, x + 2.4, y + 2.4), fill=(167, 240, 120, 185))
    for a, b in zip(body_xy[:-1], body_xy[1:]):
        draw.line((a[0], a[1], b[0], b[1]), fill=(167, 240, 120, 82), width=1)

    for points, x_offset, color in [(left, 392, (122, 229, 195)), (right, 562, (216, 244, 165))]:
        xy = project(points, x_offset, 126)
        for a, b in HAND_EDGES:
            draw.line((xy[a][0], xy[a][1], xy[b][0], xy[b][1]), fill=color + (180,), width=2)
        for x, y in xy:
            draw.ellipse((x - 2.4, y - 2.4, x + 2.4, y + 2.4), fill=color + (220,))
    draw_label(draw, (18, 18), "body + hand mocap", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")


def text_thumb(h5) -> str:
    from PIL import ImageDraw

    width = THUMB_WIDTH
    raw = h5["caption"][()]
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", errors="replace")
    data = json.loads(raw)
    segment = data["segments"][0]
    objects = sorted({item for values in segment.get("objects", {}).values() for item in values})[:5]
    actions = [a.get("label", "") for a in segment.get("Current Action", [])][:2]
    canvas = make_canvas((width, THUMB_HEIGHT))
    draw = ImageDraw.Draw(canvas, "RGBA")
    draw_label(draw, (28, 24), "language annotation", fill=(244, 248, 239), size=28)
    y = 82
    for label in objects:
        chip_width = 52 + len(label) * 16
        draw.rounded_rectangle((28, y, 28 + chip_width, y + 38), radius=8, fill=(7, 18, 7, 235), outline=(167, 240, 120, 170), width=2)
        draw_label(draw, (44, y + 8), label, fill=(244, 248, 239), size=18)
        y += 47
    x = 340
    y = 92
    for action in actions:
        wrapped = action[:66] + ("..." if len(action) > 66 else "")
        draw.rounded_rectangle((x, y, width - 28, y + 54), radius=9, fill=(7, 18, 7, 235), outline=(122, 229, 195, 180), width=2)
        draw_label(draw, (x + 22, y + 15), wrapped, fill=(244, 248, 239), size=20)
        y += 68
    return image_data_uri(canvas, "PNG")


def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]:
    if sample_dir is None or not sample_dir.exists():
        return {}
    hdf5_path = sample_dir / "annotation.hdf5"
    required = [sample_dir / "fisheye_cam0.mp4", hdf5_path]
    if not all(path.exists() for path in required):
        return {}
    try:
        import h5py

        thumbnails = {"video": video_thumb(sample_dir), "audio": audio_thumb(sample_dir)}
        with h5py.File(hdf5_path, "r") as h5:
            thumbnails.update({
                "depth": depth_thumb(h5),
                "pose / SLAM": slam_thumb(h5),
                "motion capture": mocap_thumb(h5),
                "inertial": imu_thumb(h5),
                "language": text_thumb(h5),
            })
        return thumbnails
    except Exception as exc:
        print(f"Warning: could not build sample modality thumbnails: {exc}")
        return {}


def valid_sample_dir(sample_dir: Path | None) -> bool:
    if sample_dir is None:
        return False
    return (sample_dir / "annotation.hdf5").exists() and (sample_dir / "fisheye_cam0.mp4").exists()


def resolve_sample_dir(sample_dir: Path | None) -> Path | None:
    candidates: list[Path] = []
    env_sample_dir = os.environ.get("XPERIENCE10M_SAMPLE_DIR")
    if env_sample_dir:
        candidates.append(Path(env_sample_dir).expanduser())
    workspace = os.environ.get("WORKSPACE")
    if workspace:
        candidates.append(Path(workspace).expanduser() / "data/sample/xperience-10m-sample")
    if sample_dir is not None:
        candidates.append(sample_dir)
    candidates.extend([
        DEFAULT_SAMPLE_DIR,
        DROPBOX_SAMPLE_DIR,
    ])
    for candidate in candidates:
        if valid_sample_dir(candidate):
            return candidate
    return sample_dir


def load_summary() -> dict:
    return json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))


def fmt(value: float) -> str:
    return f"{float(value):.4f}"


def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    if task_name == "hand_trajectory_forecast":
        return "MPJPE", fmt(metrics["mpjpe"])
    if task_name == "cross_modal_retrieval":
        return "top-5", fmt(metrics["top5_accuracy"])
    if task_name == "caption_grounding":
        return "MRR", fmt(metrics["mrr"])
    if task_name == "object_relevance":
        return "micro-F1", fmt(metrics["micro_f1"])
    if task_name == "modality_reconstruction":
        return "R2", fmt(metrics["r2"])
    if task_name in {"temporal_order", "misalignment_detection"}:
        return "F1", fmt(metrics["f1"])
    if "macro_f1" in metrics:
        return "macro-F1", fmt(metrics["macro_f1"])
    if "accuracy" in metrics:
        return "accuracy", fmt(metrics["accuracy"])
    raise KeyError(f"No main metric configured for {task_name}")


def short_io(task_name: str, metrics: dict) -> str:
    custom = {
        "timeline_action": "all featurized modalities -> action label",
        "timeline_subtask": "all featurized modalities -> subtask label",
        "transition_detection": "all featurized modalities -> boundary vs steady",
        "next_action": "window at t -> action at t+20 frames",
        "hand_trajectory_forecast": "all featurized modalities -> future hand joints",
        "contact_prediction": "non-contact modalities -> contact state",
        "object_relevance": "non-caption feature blocks -> relevant objects",
        "caption_grounding": "text query -> matching sensor window",
        "cross_modal_retrieval": "motion / IMU / camera -> depth / video match",
        "modality_reconstruction": "motion / IMU / camera -> depth / video vector",
        "temporal_order": "two adjacent windows -> correct order",
        "misalignment_detection": "motion + visual pair -> aligned or shifted",
    }
    return custom.get(task_name, metrics.get("input", ""))


def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int, neural_metrics: dict | None = None) -> str:
    label, value = metric_for(task_name, metrics)
    neural_html = ""
    if neural_metrics and "error" not in neural_metrics:
        neural_label, neural_value = metric_for(task_name, neural_metrics)
        neural_html = f"""
        <div class="metric neural">
          <span>NN {html.escape(neural_label)}</span>
          <strong>{html.escape(neural_value)}</strong>
        </div>
        """
    io = short_io(task_name, metrics)
    return f"""
      <article class="task-card" style="--accent:{group['color']};--soft:{group['soft']};">
        <div class="task-meta">
          <span class="index">{index:02d}</span>
          <span class="kind">{html.escape(kind)}</span>
        </div>
        <h3>{html.escape(task_name)}</h3>
        <p>{html.escape(io)}</p>
        <div class="metric">
          <span>min {html.escape(label)}</span>
          <strong>{html.escape(value)}</strong>
        </div>
        {neural_html}
      </article>
    """


def modality_card(name: str, modality_type: str, sample_text: str, feature_text: str, index: int, thumbnail: str | None) -> str:
    thumb_html = ""
    if thumbnail:
        thumb_html = f'<div class="modality-thumb"><img src="{thumbnail}" alt=""></div>'
    return f"""
      <article class="modality">
        <div class="modality-heading">
          <div>
            <span class="modality-index">{index:02d}</span>
            <h3>{html.escape(name)}</h3>
          </div>
          <span class="modality-type">{html.escape(modality_type)}</span>
        </div>
        {thumb_html}
        <div class="modality-copy">
          <div class="modality-row">
            <span>Sample contains</span>
            <p>{html.escape(sample_text)}</p>
          </div>
          <div class="modality-row">
            <span>Current baseline use</span>
            <p>{html.escape(feature_text)}</p>
          </div>
        </div>
      </article>
    """


def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str:
    suite = summary["tasks"]
    neural_suite = summary.get("neural_tasks", {})
    thumbnails = load_sample_thumbnails(sample_dir)
    base_layer = ""
    if base_image is not None and base_image.exists():
        base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>'
    stats = [
        (f"{summary['num_frames']:,}", "frames"),
        (f"{summary['num_windows']:,}", "windows"),
        (f"{summary['feature_dim']:,}", "features"),
        (f"{len(suite)}+{len(neural_suite)}", "min + NN tasks"),
        ("70/30", "chronological split"),
    ]
    stats_html = "".join(
        f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>"
        for value, label in stats
    )
    modalities_html = "".join(
        modality_card(name, modality_type, sample_text, feature_text, index, thumbnails.get(name))
        for index, (name, modality_type, sample_text, feature_text) in enumerate(MODALITIES, start=1)
    )

    task_index = 1
    families = []
    for group in GROUPS:
        cards = []
        for task_name, kind in group["tasks"]:
            cards.append(task_card(task_name, kind, suite[task_name], group, task_index, neural_suite.get(task_name)))
            task_index += 1
        families.append(
            f"""
            <section class="family" style="--accent:{group['color']};--soft:{group['soft']};">
              <div class="family-head">
                <span>{html.escape(group['tone'])}</span>
                <h2>{html.escape(group['name'])}</h2>
              </div>
              <div class="family-cards">{''.join(cards)}</div>
            </section>
            """
        )

    return f"""<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1">
  <title>Xperience-10M 12-Task Episode Suite Infographic</title>
  <style>
    * {{ box-sizing: border-box; }}
    html,
    body {{
      margin: 0;
      width: {CANVAS_WIDTH}px;
      height: {CANVAS_HEIGHT}px;
      background: #020502;
    }}
    body {{
      font-family: "Inter Tight", "Space Grotesk", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
      color: #f4f8ef;
      text-rendering: optimizeLegibility;
    }}
    .canvas {{
      position: relative;
      width: {CANVAS_WIDTH}px;
      height: {CANVAS_HEIGHT}px;
      overflow: hidden;
      padding: 54px 64px 44px;
      background:
        radial-gradient(circle at 72% 10%, rgba(167,240,120,0.18), transparent 24%),
        radial-gradient(circle at 20% 28%, rgba(255,255,255,0.10) 1px, transparent 2px),
        #020502;
      background-size: auto, 18px 18px, auto;
    }}
    .image-background {{
      position: absolute;
      inset: 0;
      background-position: center;
      background-repeat: no-repeat;
      background-size: cover;
      opacity: 0.36;
      filter: saturate(1.05) contrast(1.08) brightness(0.42);
    }}
    .content {{
      position: relative;
      z-index: 1;
    }}
    .header {{
      display: grid;
      grid-template-columns: 1.25fr 0.75fr;
      gap: 44px;
      align-items: end;
      padding-bottom: 30px;
      border-bottom: 1px solid rgba(167,240,120,0.20);
    }}
    .kicker {{
      display: inline-flex;
      align-items: center;
      gap: 12px;
      color: #ccffa0;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 15px;
      text-transform: uppercase;
      letter-spacing: 0.08em;
    }}
    .kicker::before {{
      content: "";
      width: 44px;
      height: 1px;
      background: #ccffa0;
    }}
    h1 {{
      margin: 18px 0 0;
      max-width: 930px;
      font-size: 72px;
      line-height: 0.95;
      letter-spacing: 0;
    }}
    .subtitle {{
      margin: 18px 0 0;
      max-width: 900px;
      color: #dce8d7;
      font-size: 23px;
      line-height: 1.35;
      font-weight: 520;
    }}
    .stats {{
      display: grid;
      grid-template-columns: repeat(5, minmax(0, 1fr));
      gap: 10px;
    }}
    .stat {{
      min-height: 78px;
      padding: 14px 15px;
      border: 1px solid rgba(167,240,120,0.24);
      background: rgba(7,18,7,0.80);
      border-radius: 8px;
    }}
    .stat strong {{
      display: block;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 25px;
      line-height: 1;
      font-variant-numeric: tabular-nums;
    }}
    .stat span {{
      display: block;
      margin-top: 8px;
      color: #a5afa2;
      font-size: 13px;
      line-height: 1.15;
    }}
    .section-label {{
      display: grid;
      grid-template-columns: 1fr;
      gap: 12px;
      align-items: start;
      margin: 44px 0 24px;
      color: #a5afa2;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 22px;
      text-transform: uppercase;
      letter-spacing: 0.08em;
    }}
    .section-label span:last-child {{
      max-width: 1400px;
      color: #dce8d7;
      text-transform: none;
      letter-spacing: 0;
      font-family: inherit;
      font-size: 21px;
      line-height: 1.42;
      text-align: left;
    }}
    .modalities {{
      display: grid;
      grid-template-columns: 1fr;
      gap: 34px;
    }}
    .modality {{
      min-height: 560px;
      padding: 34px;
      border: 1px solid rgba(167,240,120,0.22);
      background: rgba(7,18,7,0.84);
      border-radius: 8px;
      display: grid;
      grid-template-columns: 880px minmax(0, 1fr);
      grid-template-areas:
        "thumb heading"
        "thumb copy";
      column-gap: 46px;
      row-gap: 28px;
      align-items: start;
    }}
    .modality-thumb {{
      grid-area: thumb;
      height: 492px;
      overflow: hidden;
      border: 1px solid rgba(167,240,120,0.16);
      border-radius: 8px;
      background: #020502;
    }}
    .modality-thumb img {{
      display: block;
      width: 100%;
      height: 100%;
      object-fit: cover;
    }}
    .modality-index,
    .index {{
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-variant-numeric: tabular-nums;
    }}
    .modality-heading {{
      grid-area: heading;
      display: flex;
      align-items: start;
      justify-content: space-between;
      gap: 24px;
      padding-bottom: 26px;
      border-bottom: 1px solid rgba(167,240,120,0.16);
    }}
    .modality-index {{
      color: #a5afa2;
      font-size: 24px;
    }}
    .modality-type {{
      color: #ccffa0;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 16px;
      line-height: 1.15;
      text-transform: uppercase;
      letter-spacing: 0.08em;
      text-align: right;
      max-width: 330px;
      padding-top: 8px;
    }}
    .modality h3 {{
      margin: 14px 0 0;
      font-size: 76px;
      line-height: 0.98;
      text-transform: uppercase;
    }}
    .modality-copy {{
      grid-area: copy;
      display: grid;
      grid-template-columns: 1fr;
      gap: 22px;
    }}
    .modality-row {{
      display: grid;
      grid-template-columns: 1fr;
      gap: 10px;
      align-items: baseline;
      padding: 22px 24px;
      border: 1px solid rgba(167,240,120,0.16);
      border-radius: 8px;
      background: rgba(2,5,2,0.40);
    }}
    .modality-row span {{
      display: block;
      color: #a5afa2;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 16px;
      letter-spacing: 0.06em;
      line-height: 1.25;
      text-transform: uppercase;
    }}
    .modality-row p {{
      margin: 0;
      color: #dce8d7;
      font-size: 40px;
      font-weight: 650;
      line-height: 1.15;
    }}
    .shared-band {{
      display: grid;
      grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr;
      gap: 12px;
      align-items: center;
      margin-top: 30px;
      padding: 14px;
      border: 1px solid rgba(167,240,120,0.22);
      background: rgba(7,18,7,0.72);
      border-radius: 8px;
    }}
    .step {{
      min-height: 62px;
      padding: 13px 15px;
      background: rgba(7,18,7,0.92);
      border: 1px solid rgba(167,240,120,0.16);
      border-radius: 8px;
    }}
    .step strong {{
      display: block;
      font-size: 17px;
      line-height: 1.1;
    }}
    .step span {{
      display: block;
      margin-top: 5px;
      color: #a5afa2;
      font-size: 13px;
    }}
    .arrow {{
      color: #ccffa0;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 22px;
    }}
    .families {{
      display: grid;
      grid-template-columns: repeat(2, minmax(0, 1fr));
      gap: 24px;
      margin-top: 30px;
    }}
    .family {{
      padding: 20px;
      border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
      background: rgba(7,18,7,0.82);
      border-radius: 8px;
    }}
    .family-head {{
      display: flex;
      align-items: end;
      justify-content: space-between;
      gap: 16px;
      min-height: 66px;
      padding-bottom: 16px;
      border-bottom: 1px solid color-mix(in srgb, var(--accent) 24%, #020502);
    }}
    .family-head span {{
      color: var(--accent);
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 12px;
      text-transform: uppercase;
      letter-spacing: 0.08em;
    }}
    .family-head h2 {{
      margin: 0;
      color: var(--accent);
      font-size: 32px;
      line-height: 1.02;
      text-align: right;
    }}
    .family-cards {{
      display: grid;
      gap: 16px;
      margin-top: 18px;
    }}
    .task-card {{
      min-height: 178px;
      padding: 18px 20px;
      border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
      background: linear-gradient(180deg, rgba(10,24,10,0.96), color-mix(in srgb, var(--soft) 24%, #071207));
      border-radius: 8px;
    }}
    .task-meta {{
      display: flex;
      align-items: center;
      justify-content: space-between;
      gap: 12px;
    }}
    .index {{
      color: #a5afa2;
      font-size: 12px;
    }}
    .kind {{
      display: inline-flex;
      align-items: center;
      height: 24px;
      padding: 0 9px;
      border-radius: 6px;
      border: 1px solid color-mix(in srgb, var(--accent) 40%, #020502);
      color: var(--accent);
      background: rgba(2,5,2,0.48);
      text-transform: uppercase;
      font-size: 11px;
      line-height: 1;
      font-weight: 830;
    }}
    .task-card h3 {{
      margin: 12px 0 0;
      color: #f4f8ef;
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 21px;
      line-height: 1.18;
      overflow-wrap: anywhere;
    }}
    .task-card p {{
      margin: 11px 0 0;
      min-height: 39px;
      color: #dce8d7;
      font-size: 15px;
      line-height: 1.28;
      font-weight: 560;
    }}
    .metric {{
      display: inline-flex;
      align-items: baseline;
      gap: 10px;
      margin-top: 10px;
      min-height: 32px;
      padding: 7px 10px;
      border-radius: 8px;
      border: 1px solid color-mix(in srgb, var(--accent) 42%, #020502);
      background: rgba(2,5,2,0.42);
    }}
    .metric.neural {{
      margin-left: 8px;
      border-color: rgba(255,255,255,0.20);
      background: rgba(255,255,255,0.08);
    }}
    .metric span {{
      color: #a5afa2;
      font-size: 13px;
      font-weight: 760;
    }}
    .metric strong {{
      color: var(--accent);
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      font-size: 20px;
      line-height: 1;
      font-weight: 860;
      font-variant-numeric: tabular-nums;
    }}
    .footer {{
      display: flex;
      align-items: center;
      justify-content: space-between;
      gap: 32px;
      margin-top: 22px;
      padding-top: 20px;
      border-top: 1px solid rgba(167,240,120,0.20);
      color: #a5afa2;
      font-size: 18px;
      line-height: 1.35;
      font-weight: 620;
    }}
    .footer code {{
      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
      color: #020502;
      background: #ccffa0;
      border: 1px solid #ccffa0;
      border-radius: 7px;
      padding: 6px 9px;
      white-space: nowrap;
    }}
  </style>
</head>
<body>
  <main class="canvas" aria-label="Ropedia Xperience-10M 12-task suite infographic">
    {base_layer}
    <div class="content">
    <header class="header">
      <div>
        <div class="kicker">verified single-episode task suite</div>
        <h1>Ropedia Xperience-10M 12-task suite</h1>
        <p class="subtitle">A clean map from synchronized multimodal windows to 12 research task heads, comparing minimal heads with neural MLP results. Next milestone: Qwen3-Omni fine-tuning with sensor-bridge evaluation.</p>
      </div>
      <div class="stats">{stats_html}</div>
    </header>

    <section class="shared-band" aria-label="shared processing contract">
      <div class="step"><strong>raw public episode</strong><span>video, audio, depth, pose, mocap, IMU, language</span></div>
      <div class="arrow">-></div>
      <div class="step"><strong>20-frame windows</strong><span>stride 5, chronological order</span></div>
      <div class="arrow">-></div>
      <div class="step"><strong>{summary['feature_dim']:,}-d vector</strong><span>current manifest includes audio features</span></div>
      <div class="arrow">-></div>
      <div class="step"><strong>12 minimal + NN heads</strong><span>softmax/ridge/logistic plus PyTorch MLP</span></div>
    </section>

    <div class="section-label">
      <span>12 task families</span>
      <span>Every task below has a minimal baseline and a neural MLP head over the same aligned window contract, making the suite easy to compare, extend, and scale to held-out episodes.</span>
    </div>
    <section class="families">{''.join(families)}</section>

    <div class="section-label">
      <span>Xperience-10M modalities</span>
      <span>Public-sample thumbnails are enlarged here so each data stream is legible. Audio is present in the sample MP4 stream and is now extracted into the current baseline manifest.</span>
    </div>
    <section class="modalities">{modalities_html}</section>

    <footer class="footer">
      <span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span>
      <code>results/episode_task_suite/summary_report.json</code>
    </footer>
    </div>
  </main>
</body>
</html>
"""


def render_html(html_path: Path, output_path: Path) -> None:
    output_path.parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "npx",
            "--yes",
            "playwright",
            "screenshot",
            "--full-page",
            f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}",
            html_path.resolve().as_uri(),
            str(output_path),
        ],
        check=True,
    )


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
    parser.add_argument("--sample-dir", type=Path, default=DEFAULT_SAMPLE_DIR)
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    parser.add_argument("--html", type=Path)
    parser.add_argument("--no-export", action="store_true", help="Only write the HTML used to render the image.")
    args = parser.parse_args()

    summary = load_summary()
    sample_dir = resolve_sample_dir(args.sample_dir)
    html_text = build_html(summary, args.base_image, sample_dir)
    if args.html is None:
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
            handle.write(html_text)
            html_path = Path(handle.name)
    else:
        html_path = args.html
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")

    if not args.no_export:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote render HTML: {html_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())