#!/usr/bin/env python3
"""
Generate static SVG visualizations and website data for the Xperience-10M task suite.
No plotting dependencies are required; this uses only the Python standard
library so the repo stays easy to run.
The polished GitHub Pages homepage in docs/index.html is hand-curated and is
not overwritten by this script. This script refreshes docs/assets/*.svg,
docs/assets/charts/*.svg, and docs/data/summary_metrics.json.
"""
from __future__ import annotations

import html
import json
import math
import textwrap
from pathlib import Path
# Directory two levels above this file's resolved location, i.e. the parent of
# the folder that contains this script (presumably the repository root).
ROOT = Path(__file__).resolve().parents[1]
# Inputs: metrics and manifest JSON files loaded by collect_summary().
RESULTS = ROOT / "results"
# Output tree; write_summary_data() writes docs/data/summary_metrics.json under it.
DOCS = ROOT / "docs"
# Destination of the pipeline and task-architecture diagrams.
ASSETS = DOCS / "assets"
# Destination of the individual bar charts written by generate_charts().
CHARTS = ASSETS / "charts"
# Hard-coded status record; collect_summary() copies it unchanged into the
# summary under the "omni_relay" key, so it ends up in summary_metrics.json.
# Nothing in this file computes or validates these values.
OMNI_RELAY = {
"status": "selected_relay_in_progress",
"dataset": "ropedia-ai/xperience-10m",
"staging": "prepared_generic_host_to_host_transfer",
"training_target": "external_multi_gpu_training_host",
"selection_strategy": "stratified_round_robin_by_top_level_session",
"target_episodes": 128,
"selected_sessions": 128,
"candidate_scan_top_level_sessions": 802,
"valid_candidates": 12102,
"estimated_bytes": 298188841943,
"exclude": ["visualization.rrd"],
"access_status": "Full-dataset access is granted; selected multi-episode relay is in progress.",
"current_scope": "The selected-episode Qwen3-Omni fine-tune requires completed data staging and held-out evaluation.",
}
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON document stored at *path* and return it."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def svg_bar_chart(path: Path, title: str, rows: list[tuple[str, float]], x_label: str = "score", max_value: float | None = None) -> None:
# Build an SVG document from ``rows`` (label, value pairs) and write it to
# ``path`` as UTF-8 text, creating the parent directory first.
# NOTE(review): in this view the ``parts`` list literal below is cut off after
# its opening bracket, so the markup that consumes ``title``, ``x_label``,
# ``height``, ``plot_w`` and ``colors`` is not visible. Restore the literal
# from version control before relying on any description of the drawing.
path.parent.mkdir(parents=True, exist_ok=True)
# Fixed canvas geometry in SVG user units.
width = 1100
row_h = 34
top = 78
left = 310
right = 70
# The canvas grows by ``row_h`` for every row, plus fixed space above and below.
height = top + row_h * len(rows) + 70
# When the caller gives no axis maximum, use the largest row value but never
# less than 1.0 (the appended ``[1.0]`` also keeps ``max`` safe for empty rows).
max_value = max_value if max_value is not None else max([v for _, v in rows] + [1.0])
# Floor at a tiny positive number; presumably guards a later division by
# ``max_value`` when scaling bars (that code is not visible here).
max_value = max(max_value, 1e-9)
# Horizontal space left between the left and right margins.
plot_w = width - left - right
# Colour palette; how it is assigned to bars is not visible in this view.
colors = ["#ccffa0", "#ffffff", "#7ae5c3", "#d8f4a5", "#9bdfff", "#ff8f7a"]
parts = [
f'")
# The collected fragments are joined with newlines into one document.
path.write_text("\n".join(parts), encoding="utf-8")
# Raw feature-block names that get a hand-picked public label instead of the
# generic "underscores become spaces" rendering.
FEATURE_DISPLAY_NAMES = {
    "audio_fisheye_cam0_aac": "audio",
    "caption_objects_interaction_text": "language text",
}


def display_feature_name(name: str) -> str:
    """Return the public label for the feature block called *name*.

    Names listed in ``FEATURE_DISPLAY_NAMES`` use their mapped label; every
    other name is returned with each underscore replaced by a space.
    """
    try:
        return FEATURE_DISPLAY_NAMES[name]
    except KeyError:
        return name.replace("_", " ")
def svg_feature_blocks(path: Path, feature_manifest: list[dict]) -> None:
    """Write a bar chart of per-block feature dimensions to *path*.

    Args:
        path: Destination SVG file.
        feature_manifest: Entries with a ``"name"`` and a ``"dim"`` key; names
            are passed through ``display_feature_name`` for the row labels.
    """
    rows = [
        (display_feature_name(block["name"]), float(block["dim"]))
        for block in feature_manifest
    ]
    # Leave 8% headroom above the widest block. With an empty manifest there is
    # no maximum to take, so pass None and let svg_bar_chart use its own default
    # instead of failing with "max() arg is an empty sequence".
    axis_max = max(dim for _, dim in rows) * 1.08 if rows else None
    svg_bar_chart(
        path,
        "Current Extracted Feature Blocks",
        rows,
        x_label="feature dimensions",
        max_value=axis_max,
    )
def svg_pipeline_diagram(path: Path, summary: dict) -> None:
# Build the pipeline overview diagram from ``summary`` and write it to ``path``
# as UTF-8 text, creating the parent directory first.
# Reads summary["suite"] (num_frames, window_frames, stride_frames,
# num_windows, feature_dim, tasks) and summary["feature_manifest"].
# NOTE(review): in this view the ``parts`` list literal near the end is cut off
# after its opening bracket, so the code that turns ``boxes`` into markup and
# uses ``width``/``height`` is not visible. Restore it from version control.
path.parent.mkdir(parents=True, exist_ok=True)
suite = summary["suite"]
task_count = len(suite["tasks"])
width, height = 1400, 760
# One tuple per stage box. The fields look like (x, y, width, height, heading,
# body lines, fill colour) -- TODO confirm against the rendering code.
boxes = [
(60, 110, 250, 132, "1. Raw public sample", [
"annotation.hdf5",
"6 MP4 videos with audio",
f"{suite['num_frames']:,} aligned frames",
], "#9bdfff"),
(365, 110, 250, 132, "2. HOMIE loader", [
"video, depth, pose",
"mocap, IMU, language",
"audio features",
], "#7ae5c3"),
(670, 110, 250, 132, "3. Window builder", [
f"{suite['window_frames']}-frame windows",
f"{suite['stride_frames']}-frame stride",
f"{suite['num_windows']:,} windows",
], "#ccffa0"),
(975, 110, 300, 132, "4. Feature vector", [
f"{suite['feature_dim']:,} dimensions",
f"{len(summary['feature_manifest'])} named blocks",
"audio represented",
"stored manifest",
], "#d8f4a5"),
(60, 380, 360, 168, "5. Baseline models", [
"motion-only action/subtask",
"current all-feature action/subtask",
"numpy softmax classifier",
"metrics and predictions",
], "#9bdfff"),
(520, 380, 360, 168, "6. Ropedia Xperience-10M suite", [
f"{task_count} supervised/self-supervised tasks",
"chronological split",
"retrieval, forecast, alignment",
"per-task artifacts",
], "#7ae5c3"),
(980, 380, 300, 168, "7. Published artifacts", [
"results/**/*.json/csv/npz",
"docs/data/summary_metrics.json",
"GitHub Pages dashboard",
"reproducibility check",
], "#ccffa0"),
]
parts = [
f'")
# The collected fragments are joined with newlines into one document.
path.write_text("\n".join(parts), encoding="utf-8")
def feature_dim(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> int:
    """Sum the ``"dim"`` of the manifest blocks selected by name prefix.

    Args:
        feature_manifest: Entries with a ``"name"`` and a ``"dim"`` key.
        include: When non-empty, only blocks whose name starts with one of
            these prefixes are counted. Empty or ``None`` selects every block.
        exclude: Blocks whose name starts with one of these prefixes are
            dropped, even when ``include`` selected them.

    Returns:
        The total dimension of the surviving blocks (0 when none survive).
    """
    wanted = tuple(include or ())
    unwanted = tuple(exclude or ())
    total = 0
    for block in feature_manifest:
        name = block["name"]
        # str.startswith accepts a tuple of prefixes; an exact match is just
        # the case where the prefix is the whole name.
        selected = not wanted or name.startswith(wanted)
        rejected = bool(unwanted) and name.startswith(unwanted)
        if selected and not rejected:
            total += int(block["dim"])
    return total
# Headline metric per task: task name -> (label shown to the reader, key in the
# task's metrics dict). Tasks not listed here fall back to macro-F1 or accuracy.
_TASK_HEADLINE_METRICS = {
    "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
    "cross_modal_retrieval": ("top-5", "top5_accuracy"),
    "caption_grounding": ("MRR", "mrr"),
    "object_relevance": ("micro-F1", "micro_f1"),
    "modality_reconstruction": ("R2", "r2"),
    "temporal_order": ("F1", "f1"),
    "misalignment_detection": ("F1", "f1"),
}


def metric_text(task_name: str, metrics: dict) -> str:
    """Format the headline metric of *task_name* as ``"<label> <value>"``.

    Tasks with a dedicated headline metric always use it, and a missing key
    raises ``KeyError``. Any other task reports macro-F1 when present, else
    accuracy, else a pointer to the summary report. Values are printed with
    four decimals.
    """
    headline = _TASK_HEADLINE_METRICS.get(task_name)
    if headline is not None:
        label, key = headline
        return f"{label} {metrics[key]:.4f}"
    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return f"{label} {metrics[key]:.4f}"
    return "metric in summary_report.json"
def metric_text_with_neural(task_name: str, metrics: dict, neural_tasks: dict) -> str:
    """Format the headline metric, adding the neural result when one exists.

    Returns the plain ``metric_text`` of the minimal model when *neural_tasks*
    has no entry for *task_name*, the entry is empty, or it carries an
    ``"error"`` key. Otherwise returns ``"min <minimal>; NN <neural>"``.
    """
    minimal_text = metric_text(task_name, metrics)
    neural_metrics = neural_tasks.get(task_name)
    if neural_metrics and "error" not in neural_metrics:
        return f"min {minimal_text}; NN {metric_text(task_name, neural_metrics)}"
    return minimal_text
def draw_text_block(parts: list[str], x: int, y: int, lines: list[str], size: int = 13, color: str = "#dce8d7", weight: str = "500", max_chars: int = 42, line_h: int = 18) -> int:
# Wrap every entry of ``lines`` to at most ``max_chars`` characters and append
# one HTML-escaped fragment per wrapped line to ``parts`` (mutated in place).
# Returns the vertical cursor, which starts at ``y`` and advances by ``line_h``.
# NOTE(review): in this view the appended f-string contains only the escaped
# text, and ``x``, ``size``, ``color`` and ``weight`` are never referenced --
# the surrounding element markup appears to have been lost. Restore it from
# version control. Indentation is also missing, so whether the cursor advances
# per wrapped line or per input line cannot be confirmed here.
cursor = y
for line in lines:
# textwrap.wrap returns [] for an empty string; substitute one empty fragment
# so the entry still produces a line.
wrapped = textwrap.wrap(line, width=max_chars) or [""]
for item in wrapped:
parts.append(f'{html.escape(item)}')
cursor += line_h
return cursor
def _with_display_aliases(prefixes: list[str]) -> list[str]:
    """Return *prefixes* plus the display-name spellings of the same groups."""
    expanded: list[str] = []
    for prefix in prefixes:
        expanded.append(prefix)
        # display_feature_name() renders underscores as spaces ...
        expanded.append(prefix.replace("_", " "))
        # ... and gives a few raw names a short label that shares no prefix
        # with the raw spelling (e.g. "audio_fisheye_cam0_aac" -> "audio").
        expanded.extend(
            label for raw, label in FEATURE_DISPLAY_NAMES.items() if raw.startswith(prefix)
        )
    return expanded


def task_architecture_rows(summary: dict) -> list[dict]:
    """Describe every task of the suite as one row for the architecture diagram.

    Args:
        summary: Mapping with a ``"suite"`` entry (``tasks``, ``feature_dim``
            and optional ``neural_tasks``) and a ``"feature_manifest"`` list of
            ``{"name", "dim"}`` blocks.

    Returns:
        One dict per task with the keys ``task``, ``family``, ``input``,
        ``head``, ``output`` and ``metric``, in display order.

    Raises:
        KeyError: If a task or one of the fields read below is missing.
    """
    suite = summary["suite"]
    tasks = suite["tasks"]
    neural_tasks = suite.get("neural_tasks", {})
    manifest = summary["feature_manifest"]
    all_dim = int(suite["feature_dim"])

    # Feature groups are identified by raw block-name prefixes, but the manifest
    # stored in the summary by collect_summary() carries display names
    # ("body contacts", "language text", "audio"). Matching on the raw spelling
    # only would make every include group sum to 0 and every exclude a no-op,
    # so each prefix list is expanded to cover both spellings.
    text_blocks = ["caption_objects_interaction_text"]
    motion_blocks = ["hand_", "body_joints", "body_contacts", "camera_", "imu_"]
    visual_blocks = ["depth_confidence", "video_"]
    audio_blocks = ["audio_"]

    def block_dim(include: list[str] | None = None, exclude: list[str] | None = None) -> int:
        return feature_dim(
            manifest,
            include=_with_display_aliases(include or []),
            exclude=_with_display_aliases(exclude or []),
        )

    def metric(task_name: str) -> str:
        return metric_text_with_neural(task_name, tasks[task_name], neural_tasks)

    no_contact_text_dim = block_dim(exclude=["body_contacts", *text_blocks])
    no_text_dim = block_dim(exclude=text_blocks)
    sensor_dim = no_text_dim
    text_dim = block_dim(include=text_blocks)
    motion_dim = block_dim(include=motion_blocks)
    motion_audio_dim = block_dim(include=motion_blocks + audio_blocks)
    visual_dim = block_dim(include=visual_blocks)
    visual_audio_dim = block_dim(include=visual_blocks + audio_blocks)
    # temporal_order concatenates two windows and their difference.
    pair_dim = all_dim * 3
    align_dim = motion_dim + visual_audio_dim

    return [
        {
            "task": "timeline_action",
            "family": "softmax",
            "input": f"X_all window, {all_dim:,}d",
            "head": "minimal linear softmax; optional NN MLP softmax",
            "output": f"current action class, {tasks['timeline_action']['num_classes']} classes",
            "metric": metric("timeline_action"),
        },
        {
            "task": "timeline_subtask",
            "family": "softmax",
            "input": f"X_all window, {all_dim:,}d",
            "head": "minimal linear softmax; optional NN MLP softmax",
            "output": f"current subtask class, {tasks['timeline_subtask']['num_classes']} classes",
            "metric": metric("timeline_subtask"),
        },
        {
            "task": "transition_detection",
            "family": "softmax",
            "input": f"X_all window, {all_dim:,}d",
            "head": "minimal linear softmax; optional NN MLP softmax",
            "output": "steady vs transition near action boundary",
            "metric": f"{metric('transition_detection')}; boundary-F1 {tasks['transition_detection']['boundary_f1']:.4f}",
        },
        {
            "task": "next_action",
            "family": "softmax",
            "input": f"X_all at time t, {all_dim:,}d",
            "head": "minimal linear softmax; optional NN MLP softmax",
            "output": f"action at t+{tasks['next_action'].get('future_frames', 20)} frames",
            "metric": metric("next_action"),
        },
        {
            "task": "hand_trajectory_forecast",
            "family": "ridge",
            "input": f"X_all at time t, {all_dim:,}d",
            "head": "minimal dual ridge; optional NN MLP regression",
            "output": f"future hand joints, {tasks['hand_trajectory_forecast']['target_dim']}d",
            "metric": metric("hand_trajectory_forecast"),
        },
        {
            "task": "contact_prediction",
            "family": "softmax",
            "input": f"X without contact/text leakage, {no_contact_text_dim:,}d",
            "head": "minimal linear softmax; optional NN MLP softmax",
            "output": "any body contact in window; degenerate one-class sample",
            "metric": metric("contact_prediction"),
        },
        {
            "task": "object_relevance",
            "family": "multilabel",
            "input": f"X without caption text, {no_text_dim:,}d",
            "head": "minimal sigmoid logistic; optional NN MLP multilabel",
            "output": f"multi-hot object set, {tasks['object_relevance']['num_objects']} objects",
            "metric": metric("object_relevance"),
        },
        {
            "task": "caption_grounding",
            "family": "ridge+rank",
            "input": f"sensor {sensor_dim:,}d -> text space {text_dim:,}d",
            "head": "minimal ridge or NN MLP projection, then cosine rank",
            "output": "text query retrieves matching time window",
            "metric": metric("caption_grounding"),
        },
        {
            "task": "cross_modal_retrieval",
            "family": "ridge+rank",
            "input": f"motion/IMU/camera/audio {motion_audio_dim:,}d -> visual {visual_dim:,}d",
            "head": "minimal ridge or NN MLP projection, then cosine rank",
            "output": "retrieve matching depth/video window",
            "metric": metric("cross_modal_retrieval"),
        },
        {
            "task": "modality_reconstruction",
            "family": "ridge",
            "input": f"motion/IMU/camera/audio {motion_audio_dim:,}d",
            "head": "minimal dual ridge; optional NN MLP regression",
            "output": f"depth/video feature vector, {visual_dim:,}d",
            "metric": metric("modality_reconstruction"),
        },
        {
            "task": "temporal_order",
            "family": "softmax",
            "input": f"concat[x_t, x_t+1, diff], {pair_dim:,}d",
            "head": "minimal binary softmax; optional NN MLP softmax",
            "output": "correct vs reversed adjacent windows",
            "metric": metric("temporal_order"),
        },
        {
            "task": "misalignment_detection",
            "family": "softmax",
            "input": f"concat[motion_t, visual+audio_t/shifted], {align_dim:,}d",
            "head": "minimal binary softmax; optional NN MLP softmax",
            "output": "aligned vs shifted by 8 windows",
            "metric": metric("misalignment_detection"),
        },
    ]
def svg_task_architectures(path: Path, summary: dict) -> None:
# Build the per-task architecture diagram from ``summary`` and write it to
# ``path`` as UTF-8 text, creating the parent directory first.
# NOTE(review): in this view the ``parts`` list literal below is cut off after
# its opening bracket, so the code that lays out ``rows`` and uses ``suite``,
# ``family_colors``, ``width`` and ``height`` is not visible. Restore it from
# version control.
path.parent.mkdir(parents=True, exist_ok=True)
suite = summary["suite"]
# One descriptive dict per task (task, family, input, head, output, metric).
rows = task_architecture_rows(summary)
# Colour per model family; the keys match the "family" values of ``rows``.
family_colors = {
"softmax": "#9bdfff",
"ridge": "#ccffa0",
"ridge+rank": "#7ae5c3",
"multilabel": "#d8f4a5",
}
width, height = 1500, 1840
parts = [
f'")
# The collected fragments are joined with newlines into one document.
path.write_text("\n".join(parts), encoding="utf-8")
def collect_summary() -> dict:
    """Load every result file under ``RESULTS`` and assemble the site summary.

    Returns:
        A dict with the keys ``omni_relay`` (the module-level status record),
        ``models`` (metrics of the four baseline models), ``suite`` (the task
        suite report) and ``feature_manifest`` (the manifest with each block
        name replaced by its display name).
    """

    def load(relative_path: str):
        return read_json(RESULTS / relative_path)

    all_modalities_action = load("min_all_modalities_action_model/metrics.json")
    all_modalities_subtask = load("min_all_modalities_subtask_model/metrics.json")
    motion_action = load("min_action_model/metrics.json")
    motion_subtask = load("min_subtask_model/metrics.json")
    suite_report = load("episode_task_suite/summary_report.json")
    raw_manifest = load("episode_task_suite/feature_manifest.json")

    # Publish human-readable block names; every other field is kept as is.
    public_manifest = []
    for block in raw_manifest:
        public_manifest.append(dict(block, name=display_feature_name(block["name"])))

    models = {
        "motion_action": motion_action,
        "motion_subtask": motion_subtask,
        "all_modalities_action": all_modalities_action,
        "all_modalities_subtask": all_modalities_subtask,
    }
    return {
        "omni_relay": OMNI_RELAY,
        "models": models,
        "suite": suite_report,
        "feature_manifest": public_manifest,
    }
def task_score(metrics: dict) -> float:
    """Return the single score used to chart a task, clamped to be >= 0.

    The first key present in *metrics* wins, in this order: ``macro_f1``,
    ``f1``, ``micro_f1``, ``top5_accuracy``, ``r2``, ``mrr``. ``mrr`` is the
    last resort so a task whose only headline number is MRR is not charted
    as 0.

    Returns:
        The chosen value as a float, or 0.0 when no key is present, the value
        is ``None``, the value is NaN/infinite, or the value is negative.
    """
    for key in ("macro_f1", "f1", "micro_f1", "top5_accuracy", "r2", "mrr"):
        if key in metrics:
            raw = metrics[key]
            break
    else:
        return 0.0
    if raw is None:
        return 0.0
    score = float(raw)
    # max(nan, 0.0) evaluates to nan, which would leak into the chart geometry,
    # so non-finite values are treated like a missing score.
    if not math.isfinite(score):
        return 0.0
    return max(score, 0.0)
def generate_charts(summary: dict) -> None:
# Write both diagrams into ASSETS and every bar chart into CHARTS, using the
# data in ``summary`` (see collect_summary for its layout).
# NOTE(review): indentation is missing in this view, so the extent of the
# ``if neural:`` body below cannot be confirmed -- in particular whether the
# minimal-vs-neural comparison chart is built only when neural results exist.
CHARTS.mkdir(parents=True, exist_ok=True)
svg_pipeline_diagram(ASSETS / "pipeline_diagram.svg", summary)
svg_task_architectures(ASSETS / "task_architectures.svg", summary)
# Macro-F1 of the four baseline models, on a fixed 0..1 axis.
model_rows = [
("Motion-only action macro-F1", summary["models"]["motion_action"]["macro_f1"]),
("Current all-feature action macro-F1", summary["models"]["all_modalities_action"]["macro_f1"]),
("Motion-only subtask macro-F1", summary["models"]["motion_subtask"]["macro_f1"]),
("Current all-feature subtask macro-F1", summary["models"]["all_modalities_subtask"]["macro_f1"]),
]
svg_bar_chart(CHARTS / "model_macro_f1.svg", "Minimal Model Macro-F1 Comparison", model_rows, max_value=1.0)
# From here on ``suite`` is the per-task metrics mapping, not the whole suite.
suite = summary["suite"]["tasks"]
# One bar per task, scored by task_score().
task_rows = []
for task_name, metrics in suite.items():
task_rows.append((task_name, task_score(metrics)))
svg_bar_chart(CHARTS / "episode_task_scores.svg", "Ropedia Xperience-10M Suite: Main Scores", task_rows, max_value=1.0)
# Neural results are optional; entries carrying an "error" key are skipped.
neural = summary["suite"].get("neural_tasks", {})
if neural:
neural_rows = [(task_name, task_score(metrics)) for task_name, metrics in neural.items() if "error" not in metrics]
if neural_rows:
svg_bar_chart(CHARTS / "episode_task_scores_neural_mlp.svg", "Ropedia Xperience-10M Suite: Neural MLP Main Scores", neural_rows, max_value=1.0)
# Interleave "<task> minimal" with "<task> neural" (when usable) per task.
comparison_rows = []
for task_name, metrics in suite.items():
comparison_rows.append((f"{task_name} minimal", task_score(metrics)))
neural_metrics = neural.get(task_name)
if neural_metrics and "error" not in neural_metrics:
comparison_rows.append((f"{task_name} neural", task_score(neural_metrics)))
if comparison_rows:
svg_bar_chart(CHARTS / "episode_task_scores_minimal_vs_neural.svg", "Episode Task Scores: Minimal vs Neural MLP", comparison_rows, max_value=1.0)
svg_feature_blocks(CHARTS / "feature_blocks.svg", summary["feature_manifest"])
# Retrieval gets its own chart; all four keys are required (KeyError otherwise).
retrieval = suite["cross_modal_retrieval"]
retrieval_rows = [
("top1", retrieval["top1_accuracy"]),
("top5", retrieval["top5_accuracy"]),
("top10", retrieval["top10_accuracy"]),
("MRR", retrieval["mrr"]),
]
svg_bar_chart(CHARTS / "cross_modal_retrieval.svg", "Cross-Modal Retrieval", retrieval_rows, max_value=1.0)
def write_summary_data(summary: dict) -> None:
    """Serialize *summary* to ``docs/data/summary_metrics.json``.

    The file is written as UTF-8, indented by two spaces, with a trailing
    newline. Missing directories are created.
    """
    data_dir = DOCS / "data"
    # parents=True also creates DOCS itself when it does not exist yet.
    data_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(summary, indent=2) + "\n"
    (data_dir / "summary_metrics.json").write_text(payload, encoding="utf-8")
def main() -> int:
    """Regenerate all charts and the summary JSON, then report what was written.

    Returns:
        0, used as the process exit status.
    """
    summary = collect_summary()
    generate_charts(summary)
    write_summary_data(summary)

    written = (
        ("pipeline diagram", ASSETS / "pipeline_diagram.svg"),
        ("task architectures diagram", ASSETS / "task_architectures.svg"),
        ("charts", CHARTS),
        ("data", DOCS / "data/summary_metrics.json"),
    )
    for label, target in written:
        print(f"Wrote {label}: {target}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())