#!/usr/bin/env python3
"""Validate the public 12-task card and walkthrough surface.

This gate is deliberately about presentation integrity, not model quality. The
repo keeps snake_case artifact ids for reproducibility, but the public website
task cards and interactive player should use research-readable names and clear
input/process/output wording.
"""

from __future__ import annotations

import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

ROOT = Path(__file__).resolve().parents[1]
TASK_JSON = ROOT / "docs/data/task_walkthroughs.json"
WEBSITE = ROOT / "docs/index.html"
WALKTHROUGH_MD = ROOT / "results/episode_task_suite/task_walkthroughs/TASK_WALKTHROUGHS.md"
OUTPUT = ROOT / "docs/data/task_surface_integrity.json"

# Artifact id (JSON key) -> human-readable name expected on the public surface.
EXPECTED_TASKS = {
    "timeline_action": "Action Recognition",
    "timeline_subtask": "Procedure Step Recognition",
    "transition_detection": "Action Boundary Detection",
    "next_action": "Next-Action Prediction",
    "hand_trajectory_forecast": "Hand Trajectory Forecasting",
    "contact_prediction": "Contact State Prediction",
    "object_relevance": "Object Relevance Prediction",
    "caption_grounding": "Language Grounding",
    "cross_modal_retrieval": "Cross-Modal Retrieval",
    "modality_reconstruction": "Cross-Modal Reconstruction",
    "temporal_order": "Temporal Order Verification",
    "misalignment_detection": "Multimodal Synchronization Detection",
}

EXPECTED_EXTENSION_NAMES = {
    "body_motion_intensity": "Body and Hand Motion Intensity",
    "multi_view_consistency_retrieval": "Multi-View Consistency Retrieval",
    "action_phase_progress": "Action Phase Progress Estimation",
    "ego_motion_forecast": "Short-Horizon Ego-Motion Forecasting",
}

REQUIRED_TASK_FIELDS = {
    "display_name",
    "research_name",
    "task_family",
    "architecture_family",
    "primary_direction",
    "card_blurb",
    "input_short",
    "process_short",
    "output_short",
    "modalities",
    "poster_modality",
    "case_study",
    "input",
    "output",
    "middle_modules",
    "metric",
    "failure_mode",
    "artifact_id",
    "plain_goal",
}

# Fields whose text is scanned for leaked snake_case task/modality ids.
DISPLAY_FIELDS = {
    "display_name",
    "research_name",
    "card_blurb",
    "input_short",
    "process_short",
    "output_short",
    "plain_goal",
}

ALLOWED_FAMILIES = {"supervised", "forecast", "retrieval", "diagnostic"}

# Modality key -> thumbnail path relative to ROOT.
MODALITY_ASSETS = {
    "video": "docs/assets/modalities/video.jpg",
    "audio": "docs/assets/modalities/audio.png",
    "depth": "docs/assets/modalities/depth.jpg",
    "pose_slam": "docs/assets/modalities/pose_slam.png",
    "motion_capture": "docs/assets/modalities/motion_capture.png",
    "inertial": "docs/assets/modalities/inertial.png",
    "language": "docs/assets/modalities/language.png",
}

# Lower-case words joined by underscores, e.g. "next_action".
RAW_ID_PATTERN = re.compile(r"\b[a-z]+(?:_[a-z0-9]+)+\b")

# A level-3 markdown heading: exactly three hashes at the start of a line.
# Anchoring keeps "#### ..." sub-headings and mid-line "### " text out of the
# section count.
_SECTION_HEADING_PATTERN = re.compile(r"^### ", re.MULTILINE)


def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the parsed JSON document."""
    return json.loads(path.read_text(encoding="utf-8"))


def check(condition: bool, name: str, failures: list[dict[str, Any]], **details: Any) -> dict[str, Any]:
    """Build one check record and register it in *failures* when it fails.

    Args:
        condition: Truthy when the check passed.
        name: Identifier stored under the record's ``"name"`` key.
        failures: Shared list; the record is appended to it when *condition*
            is falsy.
        **details: Extra key/value pairs merged into the record.

    Returns:
        The record, with ``"status"`` set to ``"pass"`` or ``"fail"``.
    """
    record = {"name": name, "status": "pass" if condition else "fail", **details}
    if not condition:
        failures.append(record)
    return record


def function_body(source: str, name: str) -> str:
    """Extract the text of ``function <name>(...) {...}`` from *source*.

    The result runs from the ``function`` keyword through the brace that
    balances the first ``{`` found after it. Braces are counted textually, so
    braces inside string literals or comments are counted as well.

    Returns:
        The extracted text; ``""`` when the function marker or an opening
        brace is not found; everything from the marker to the end of *source*
        when the braces never balance.
    """
    start = source.find(f"function {name}(")
    if start < 0:
        return ""
    brace = source.find("{", start)
    if brace < 0:
        return ""
    depth = 0
    for offset, char in enumerate(source[brace:]):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return source[start : brace + offset + 1]
    return source[start:]


def _is_number(value: Any) -> bool:
    """Return True for ints and floats, excluding booleans."""
    # bool is a subclass of int, so a JSON `true` would otherwise count as a metric.
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _is_known_modality(item: Any) -> bool:
    """Return True when *item* is a string key of MODALITY_ASSETS."""
    # The isinstance guard keeps unhashable JSON values (lists, objects) from
    # raising TypeError in the dict membership test.
    return isinstance(item, str) and item in MODALITY_ASSETS


def _validate_task_record(task_id: str, task: Any, failures: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Run every per-task check for one entry of the ``tasks`` object."""
    if not isinstance(task, dict):
        return [check(False, f"{task_id}: task_record_object", failures)]

    checks: list[dict[str, Any]] = []

    missing_fields = sorted(REQUIRED_TASK_FIELDS - set(task))
    checks.append(
        check(not missing_fields, f"{task_id}: required_fields", failures, missing=missing_fields)
    )

    expected_name = EXPECTED_TASKS.get(task_id)
    checks.append(
        check(
            # An unexpected task id has no expected name; without the explicit
            # None test a record lacking display_name would pass (None == None).
            expected_name is not None and task.get("display_name") == expected_name,
            f"{task_id}: human_readable_display_name",
            failures,
            expected=expected_name,
            observed=task.get("display_name"),
        )
    )
    checks.append(
        check(
            task.get("artifact_id") == task_id,
            f"{task_id}: artifact_id_matches_key",
            failures,
            observed=task.get("artifact_id"),
        )
    )

    # Sorted so the order of emitted checks does not depend on set iteration
    # order, which varies with string hash randomization between runs.
    for field in sorted(DISPLAY_FIELDS):
        value = str(task.get(field, ""))
        raw_hits = [
            hit
            for hit in RAW_ID_PATTERN.findall(value)
            if hit in EXPECTED_TASKS or hit in MODALITY_ASSETS
        ]
        checks.append(
            check(
                not raw_hits,
                f"{task_id}: public_field_{field}_is_human_readable",
                failures,
                value=value,
                raw_hits=raw_hits,
            )
        )

    family = task.get("task_family")
    checks.append(
        check(
            # Non-string values (possibly unhashable) are simply not allowed.
            isinstance(family, str) and family in ALLOWED_FAMILIES,
            f"{task_id}: known_task_family",
            failures,
            observed=family,
            allowed=sorted(ALLOWED_FAMILIES),
        )
    )

    modalities = task.get("modalities", [])
    modalities_is_list = isinstance(modalities, list)
    checks.append(
        check(
            modalities_is_list and bool(modalities),
            f"{task_id}: modality_list_present",
            failures,
            observed=modalities,
        )
    )
    if modalities_is_list:
        unknown = [item for item in modalities if not _is_known_modality(item)]
        missing_assets = [
            MODALITY_ASSETS[item]
            for item in modalities
            if _is_known_modality(item) and not (ROOT / MODALITY_ASSETS[item]).exists()
        ]
        checks.append(
            check(not unknown, f"{task_id}: known_modalities", failures, unknown=unknown)
        )
        checks.append(
            check(
                not missing_assets,
                f"{task_id}: modality_assets_exist",
                failures,
                missing=missing_assets,
            )
        )
    checks.append(
        check(
            # Only test membership against a real list: `in` raises TypeError
            # on numbers/None and degrades to substring matching on strings.
            modalities_is_list and task.get("poster_modality") in modalities,
            f"{task_id}: poster_modality_in_task_modalities",
            failures,
            poster_modality=task.get("poster_modality"),
            modalities=modalities,
        )
    )

    metric = task.get("metric", {})
    metric_ok = (
        isinstance(metric, dict)
        and isinstance(metric.get("name"), str)
        and isinstance(metric.get("direction"), str)
        and _is_number(metric.get("minimal"))
        and _is_number(metric.get("neural_mlp"))
    )
    checks.append(
        check(
            metric_ok,
            f"{task_id}: numeric_minimal_and_neural_metrics",
            failures,
            metric=metric,
        )
    )

    modules = task.get("middle_modules")
    module_count = len(modules) if isinstance(modules, list) else 0
    checks.append(
        check(
            module_count >= 3,
            f"{task_id}: middle_modules_explain_process",
            failures,
            observed_count=module_count,
        )
    )
    return checks


def validate_tasks(payload: dict[str, Any], failures: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Check the ``tasks`` object of the walkthrough payload.

    Verifies that ``payload["tasks"]`` is an object holding exactly the ids in
    EXPECTED_TASKS, then runs the per-task checks on every entry. Malformed
    values are recorded as failed checks rather than raising.

    Args:
        payload: Parsed walkthrough JSON document.
        failures: Shared list that receives every failing check record.

    Returns:
        All check records, passing and failing, in the order they were run.
    """
    checks: list[dict[str, Any]] = []
    tasks = payload.get("tasks", {})
    checks.append(check(isinstance(tasks, dict), "tasks_object_present", failures))
    if not isinstance(tasks, dict):
        return checks

    task_ids = set(tasks)
    expected_ids = set(EXPECTED_TASKS)
    checks.append(
        check(
            len(tasks) == len(EXPECTED_TASKS),
            "exactly_12_tasks",
            failures,
            observed=len(tasks),
            expected=len(EXPECTED_TASKS),
        )
    )
    checks.append(
        check(
            task_ids == expected_ids,
            "expected_task_ids_present",
            failures,
            missing=sorted(expected_ids - task_ids),
            extra=sorted(task_ids - expected_ids),
        )
    )
    for task_id, task in tasks.items():
        checks.extend(_validate_task_record(task_id, task, failures))
    return checks


def _case_study_in_source(task: Any, source: str) -> bool:
    """Return True when *task* has a non-blank case study that occurs in *source*."""
    if not isinstance(task, dict):
        return False
    case_study = task.get("case_study")
    if case_study is None:
        return False
    text = str(case_study)
    # An empty string is a substring of everything, so it must not count.
    return bool(text.strip()) and text in source


def validate_markdown(source: str, tasks: dict[str, Any], failures: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Check the walkthrough markdown against the expected tasks.

    Verifies that every expected task heading is present, that the document
    has exactly one level-3 heading per expected task, and that each task's
    case study text occurs in the markdown.

    Args:
        source: Markdown text.
        tasks: The ``tasks`` object of the walkthrough payload.
        failures: Shared list that receives every failing check record.

    Returns:
        All check records, passing and failing, in the order they were run.
    """
    checks: list[dict[str, Any]] = []
    for task_id, display_name in EXPECTED_TASKS.items():
        expected_heading = f"### {display_name} (`{task_id}`)"
        checks.append(
            check(
                expected_heading in source,
                f"markdown_heading_present:{task_id}",
                failures,
                expected=expected_heading,
            )
        )

    section_count = len(_SECTION_HEADING_PATTERN.findall(source))
    checks.append(
        check(
            section_count == len(EXPECTED_TASKS),
            "markdown_has_12_task_sections",
            failures,
            observed=section_count,
        )
    )

    tasks_is_dict = isinstance(tasks, dict)
    missing_case_studies = (
        [task_id for task_id, task in tasks.items() if not _case_study_in_source(task, source)]
        if tasks_is_dict
        else []
    )
    checks.append(
        check(
            tasks_is_dict and not missing_case_studies,
            "markdown_contains_case_studies",
            failures,
            missing=missing_case_studies,
        )
    )
    return checks
f"website_marker_present:{marker}", failures, marker=marker) ) task_card_renderer = function_body(source, "renderTaskCards") selector_renderer = function_body(source, "renderSelector") player_renderer = function_body(source, "renderPlayer") checks.append( check( "artifact-id" not in source, "website_no_artifact_id_css_or_markup", failures, ) ) checks.append( check( "artifact_id" not in task_card_renderer, "task_cards_do_not_render_artifact_ids", failures, ) ) checks.append( check( "task.display_name" in task_card_renderer and "task.research_name" in task_card_renderer, "task_cards_render_human_names", failures, ) ) checks.append( check( "task.input_short" in task_card_renderer and "task.process_short" in task_card_renderer and "task.output_short" in task_card_renderer, "task_cards_render_input_process_output", failures, ) ) checks.append( check( "task.poster_modality" in task_card_renderer and "task-card-media" in task_card_renderer, "task_cards_use_representative_modality_thumbnail", failures, ) ) checks.append( check( all( needle in player_renderer for needle in ["playerPoster", "middle_modules"] ) and all(needle in source for needle in ["playerProgress", "renderStageFrame(task, index)"]) and all(needle in source for needle in ['id="playerPlay"', 'id="playerPrev"', 'id="playerNext"']), "interactive_player_wired_to_task_metadata", failures, ) ) checks.append( check( all(needle in source for needle in ["function setActiveStage", "function advancePlayer", "playerScrub"]), "interactive_video_storyboard_controls_present", failures, ) ) checks.append( check( "task.display_name" in selector_renderer and "artifact_id" not in selector_renderer, "selector_uses_human_names", failures, ) ) for artifact_id, display_name in EXPECTED_EXTENSION_NAMES.items(): checks.append( check( f"