"""World Archive Mono sample — layer-switching data explorer (all 9 clips).""" from __future__ import annotations import json import shutil import tempfile from functools import lru_cache from pathlib import Path import gradio as gr import pandas as pd from huggingface_hub import hf_hub_download, hf_hub_url REPO = "WorldArchive/mono-india-workplace-sample" LEROBOT_REPO = "WorldArchive/mono-india-workplace-lerobot" TAGLINE = "Ground truth from the real economy." CALENDLY = "https://calendly.com/algorithmsdheeraj/30min" S3 = "https://ggn-egocentric-data-sample.s3.ap-south-1.amazonaws.com/sample_data_june" # (clip_id, display label) CLIPS = [ ("sample_01_shuttle_tube_packaging", "01 · Shuttle packaging"), ("sample_02_industrial_sewing_machine", "02 · Industrial sewing"), ("sample_03_heatgun_and_batching", "03 · Heat gun & batching"), ("sample_04_garment_ironing_and_packing", "04 · Garment ironing"), ("sample_05_commercial_catering", "05 · Commercial catering"), ("sample_06_cane_weaving", "06 · Cane weaving"), ("sample_07_car_detailing", "07 · Car detailing"), ("sample_08_primer_and_painting", "08 · Primer & painting"), ("sample_09_denting_and_filing", "09 · Denting & filing"), ] LAYERS = ["plain", "skeleton", "boxes"] @lru_cache(maxsize=1) def load_segments_df() -> pd.DataFrame: path = hf_hub_download(REPO, "data/segments.parquet", repo_type="dataset") return pd.read_parquet(path) @lru_cache(maxsize=1) def load_clips_df() -> pd.DataFrame: path = hf_hub_download(REPO, "data/clips.parquet", repo_type="dataset") return pd.read_parquet(path) def clip_video_path(clip_id: str, layer: str) -> str: rel = f"clips_preview/{clip_id}/{layer}.mp4" try: src = hf_hub_download(REPO, rel, repo_type="dataset") dest = Path(tempfile.gettempdir()) / "wa_explorer" / clip_id / f"{layer}.mp4" dest.parent.mkdir(parents=True, exist_ok=True) if not dest.exists() or dest.stat().st_mtime < Path(src).stat().st_mtime: shutil.copy2(src, dest) return str(dest) except Exception: return hf_hub_url(REPO, rel, repo_type="dataset") def clip_metadata(clip_id: str) -> str: try: df = load_clips_df() row = df[df["clip_id"] == clip_id].iloc[0].to_dict() return json.dumps(row, indent=2, default=str) except Exception as e: return json.dumps({"clip_id": clip_id, "error": str(e)}) def clip_segments(clip_id: str) -> str: try: df = load_segments_df() video = f"{clip_id}.mp4" rows = df[df["video"] == video].sort_values("start_sec") lines = [] for _, r in rows.iterrows(): lines.append( f"**{r['start_sec']:.1f}s–{r['end_sec']:.1f}s** · " f"{r.get('action', '?')} **{r.get('object', '?')}**" ) return "\n\n".join(lines[:12]) + ("\n\n_…and more in full pack_" if len(lines) > 12 else "") except Exception as e: return f"_Could not load segments: {e}_" def update(clip_label: str, layer: str): clip_id = next(c for c, label in CLIPS if label == clip_label) try: return clip_video_path(clip_id, layer), clip_metadata(clip_id), clip_segments(clip_id) except Exception as e: return None, json.dumps({"error": str(e)}), f"_Preview failed: {e}_" with gr.Blocks(title="World Archive Data Explorer", theme=gr.themes.Soft()) as demo: gr.Markdown( f""" # World Archive ### {TAGLINE} **Mono Clear** — annotated egocentric manipulation from Indian workplaces (factory, kitchen, repair, craft). | 9 clips | 218 segments | 8+ layers | LeRobot-ready | |:-------:|:------------:|:---------:|:-------------:| [Metadata dataset](https://huggingface.co/datasets/{REPO}) · [LeRobot mirror](https://huggingface.co/datasets/{LEROBOT_REPO}) · [Collection](https://huggingface.co/collections/WorldArchive/physical-ai-india) [Book a call]({CALENDLY}) · shubham@worldarchive.co · [Full pack ~19 GB]({S3}/index.html) · [worldarchive.co](https://worldarchive.co) ```python from lerobot.datasets.lerobot_dataset import LeRobotDataset ds = LeRobotDataset("{LEROBOT_REPO}") # 9 episodes · 46k frames ``` """ ) with gr.Row(): clip_dd = gr.Dropdown([l for _, l in CLIPS], value=CLIPS[0][1], label="Clip") layer_dd = gr.Radio(LAYERS, value="plain", label="Layer", info="plain · hand skeleton · object boxes") video = gr.Video(label="Preview (6s)", autoplay=True) with gr.Row(): meta = gr.Code(label="Clip metadata", language="json", scale=1) segs = gr.Markdown(label="Action segments", scale=1) clip_dd.change(update, [clip_dd, layer_dd], [video, meta, segs]) layer_dd.change(update, [clip_dd, layer_dd], [video, meta, segs]) demo.load(update, [clip_dd, layer_dd], [video, meta, segs]) if __name__ == "__main__": demo.launch()