g1-moves / generate_data.py
exptech's picture
Pagination, preview-to-WASM swap, trained clips first
e6d1dc8 verified
#!/usr/bin/env python3
"""Generate data.json for the G1 Moves showcase site.
Reads manifest.json + scans the filesystem to determine which media
files (GIF/MP4) exist for each clip at each pipeline stage.
Usage:
python generate_data.py # from space/ directory
python generate_data.py --repo /path/to/g1-moves
"""
import argparse
import json
import re
from pathlib import Path
# Names of the per-clip pipeline stage directories that are scanned for media.
STAGES = ["capture", "retarget", "training", "policy"]

# Descriptive subtitles for karate moves (from README), keyed by clip ID.
# Clips without an entry here get no subtitle.
KARATE_SUBTITLES = {
    # Full-length moves
    "M_Move1": "Guard Combo",
    "M_Move2": "Low Punch",
    "M_Move3": "Horse Stance",
    "M_Move4": "Spin Punch",
    "M_Move5": "Twist Punch",
    "M_Move6": "Spin Strike",
    "M_Move7": "Rapid Punch",
    "M_Move8": "Drop Spin",
    "M_Move9": "Level Change",
    "M_Move10": "Side Kick",
    "M_Move11": "Blitz",
    "M_Move17": "Double Strike",
    "M_Move18": "Front Kick",
    "M_Move19": "Slow Kata",
    "M_Move20": "Open Strike",
    # Short moves
    "M_ShortMove12": "Quick Jab",
    "M_ShortMove13": "Snap Kick",
    "M_ShortMove14": "Light Punch",
    "M_ShortMove15": "Drop Strike",
    "M_ShortMove16": "Power Burst",
}
def format_display_name(clip_id: str) -> str:
    """Turn a clip ID into a spaced, human-readable label.

    Drops a leading single-letter prefix (``B_``, ``J_``, ``M_`` or ``V_``),
    inserts a space wherever a lowercase letter or digit is followed by an
    uppercase letter and wherever a letter is followed by a digit, then
    replaces any remaining underscores with spaces.
    """
    # Applied in order; each entry is (pattern, replacement).
    rewrites = (
        (r"^[BJMV]_", ""),
        (r"([a-z\d])([A-Z])", r"\1 \2"),
        (r"([A-Za-z])(\d)", r"\1 \2"),
    )
    label = clip_id
    for pattern, replacement in rewrites:
        label = re.sub(pattern, replacement, label)
    return label.replace("_", " ")
def get_media_file(clip_id: str, stage: str) -> str:
    """Build the GIF filename expected for ``clip_id`` at ``stage``.

    The ``capture`` stage uses the bare clip ID (``<clip_id>.gif``); every
    other stage appends the stage name (``<clip_id>_<stage>.gif``).
    """
    suffix = "" if stage == "capture" else f"_{stage}"
    return f"{clip_id}{suffix}.gif"
def scan_stage(repo: Path, category: str, clip_id: str, stage: str) -> dict | None:
    """Look up the GIF/MP4 media for one clip at one pipeline stage.

    Files are searched for in ``repo/category/clip_id/stage``. The MP4 name
    is the GIF name with ``.gif`` swapped for ``.mp4``.

    Returns:
        A dict holding a ``"gif"`` and/or ``"mp4"`` key for each file that
        exists, mapped to its ``category/clip_id/stage/filename`` path
        relative to the repo root, or ``None`` when neither file exists.
    """
    stage_dir = repo / category / clip_id / stage
    relative_dir = f"{category}/{clip_id}/{stage}"

    gif_name = get_media_file(clip_id, stage)
    # GIF is listed first so it is also the first key in the result.
    candidates = (
        ("gif", gif_name),
        ("mp4", gif_name.replace(".gif", ".mp4")),
    )
    found = {
        kind: f"{relative_dir}/{filename}"
        for kind, filename in candidates
        if (stage_dir / filename).exists()
    }
    return found or None
def main():
parser = argparse.ArgumentParser(description="Generate site data from manifest")
parser.add_argument(
"--repo",
type=Path,
default=Path(__file__).parent.parent,
help="Path to g1-moves repository root",
)
args = parser.parse_args()
repo = args.repo.resolve()
manifest_path = repo / "manifest.json"
if not manifest_path.exists():
print(f"ERROR: manifest.json not found at {manifest_path}")
return
manifest = json.loads(manifest_path.read_text())
clips = []
stats = {"dance": 0, "karate": 0, "bonus": 0, "policies": 0, "total": 0}
for clip_id, info in sorted(manifest["clips"].items()):
cat = info["category"]
stats[cat] = stats.get(cat, 0) + 1
stats["total"] += 1
stages = {}
for stage in STAGES:
media = scan_stage(repo, cat, clip_id, stage)
if media:
stages[stage] = media
# Check for ONNX and PT policy files
onnx_path = repo / cat / clip_id / "policy" / f"{clip_id}_policy.onnx"
pt_path = repo / cat / clip_id / "policy" / f"{clip_id}_policy.pt"
has_onnx = onnx_path.exists()
has_pt = pt_path.exists()
has_policy = "policy" in stages or has_onnx or has_pt
if has_policy:
stats["policies"] += 1
subtitle = KARATE_SUBTITLES.get(clip_id)
display_name = format_display_name(clip_id)
if subtitle:
display_name = f"{display_name}: {subtitle}"
clips.append(
{
"id": clip_id,
"name": display_name,
"category": cat,
"performer": info.get("performer", "Unknown"),
"duration": info.get("duration_s", 0),
"fps": info.get("fps", 60),
"frames": info.get("frames", 0),
"stages": stages,
"has_policy": has_policy,
"has_onnx": has_onnx,
}
)
# Sort: categories grouped, then alphabetical
cat_order = {"dance": 0, "karate": 1, "bonus": 2}
clips.sort(key=lambda c: (cat_order.get(c["category"], 9), c["id"]))
data = {
"base_url": "https://huggingface.co/datasets/exptech/g1-moves/resolve/main",
"clips": clips,
"stats": stats,
}
output = Path(__file__).parent / "data.json"
output.write_text(json.dumps(data, indent=2))
print(f"Generated {output.name}: {len(clips)} clips, {stats['policies']} policies")
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()