data-explorer / app.py
worldarchived's picture
Cross-links to both datasets + collection
3a28087 verified
Raw
History Blame Contribute Delete
4.92 kB
"""World Archive Mono sample — layer-switching data explorer (all 9 clips)."""
from __future__ import annotations
import json
import shutil
import tempfile
from functools import lru_cache
from pathlib import Path
import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, hf_hub_url
# Hub dataset repo that the loaders below read from (parquet tables and
# per-clip preview videos are fetched with repo_type="dataset").
REPO = "WorldArchive/mono-india-workplace-sample"
# Second dataset repo; only linked and quoted in the page header snippet.
LEROBOT_REPO = "WorldArchive/mono-india-workplace-lerobot"
# Sub-heading shown under the page title.
TAGLINE = "Ground truth from the real economy."
# Target of the "Book a call" link in the page header.
CALENDLY = "https://calendly.com/algorithmsdheeraj/30min"
# Base URL of the full pack; the header links to "<S3>/index.html".
S3 = "https://ggn-egocentric-data-sample.s3.ap-south-1.amazonaws.com/sample_data_june"
# (clip_id, display label)
# clip_id is used to build repo paths and to filter the parquet tables;
# the display label is what the clip dropdown shows.
CLIPS = [
("sample_01_shuttle_tube_packaging", "01 · Shuttle packaging"),
("sample_02_industrial_sewing_machine", "02 · Industrial sewing"),
("sample_03_heatgun_and_batching", "03 · Heat gun & batching"),
("sample_04_garment_ironing_and_packing", "04 · Garment ironing"),
("sample_05_commercial_catering", "05 · Commercial catering"),
("sample_06_cane_weaving", "06 · Cane weaving"),
("sample_07_car_detailing", "07 · Car detailing"),
("sample_08_primer_and_painting", "08 · Primer & painting"),
("sample_09_denting_and_filing", "09 · Denting & filing"),
]
# Layer names offered in the layer selector; each one is also the stem of the
# preview file requested as "clips_preview/<clip_id>/<layer>.mp4".
LAYERS = ["plain", "skeleton", "boxes"]
@lru_cache(maxsize=1)
def load_segments_df() -> pd.DataFrame:
    """Download ``data/segments.parquet`` from the dataset repo and parse it.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    download and parse happen on the first call and later calls reuse the
    same DataFrame object.
    """
    return pd.read_parquet(
        hf_hub_download(
            repo_id=REPO,
            filename="data/segments.parquet",
            repo_type="dataset",
        )
    )
@lru_cache(maxsize=1)
def load_clips_df() -> pd.DataFrame:
    """Download ``data/clips.parquet`` from the dataset repo and parse it.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    download and parse happen on the first call and later calls reuse the
    same DataFrame object.
    """
    return pd.read_parquet(
        hf_hub_download(
            repo_id=REPO,
            filename="data/clips.parquet",
            repo_type="dataset",
        )
    )
def clip_video_path(clip_id: str, layer: str) -> str:
    """Return a playable location for one clip/layer preview video.

    The file ``clips_preview/<clip_id>/<layer>.mp4`` is downloaded from the
    dataset repo and mirrored to ``<tempdir>/wa_explorer/<clip_id>/<layer>.mp4``;
    the path of that mirror is returned. If any step raises, the Hub URL of
    the same repo file is returned instead.
    """
    repo_file = f"clips_preview/{clip_id}/{layer}.mp4"
    try:
        cached = Path(hf_hub_download(REPO, repo_file, repo_type="dataset"))
        mirror = Path(tempfile.gettempdir(), "wa_explorer", clip_id, f"{layer}.mp4")
        mirror.parent.mkdir(parents=True, exist_ok=True)
        # Copy only when the mirror is missing or older than the downloaded file.
        fresh = mirror.exists() and mirror.stat().st_mtime >= cached.stat().st_mtime
        if not fresh:
            shutil.copy2(cached, mirror)
        return str(mirror)
    except Exception:
        # Best-effort fallback: hand back a remote URL rather than failing.
        return hf_hub_url(REPO, repo_file, repo_type="dataset")
def clip_metadata(clip_id: str) -> str:
    """Return the clips-table row for *clip_id* as a JSON string.

    The first row whose ``clip_id`` column equals *clip_id* is converted to a
    dict and dumped with two-space indentation; values JSON cannot encode are
    passed through ``str``. If anything raises (including no matching row),
    a compact JSON object with ``clip_id`` and ``error`` keys is returned.
    """
    try:
        clips = load_clips_df()
        is_match = clips["clip_id"] == clip_id
        record = clips.loc[is_match].iloc[0].to_dict()
        return json.dumps(record, indent=2, default=str)
    except Exception as exc:
        failure = {"clip_id": clip_id, "error": str(exc)}
        return json.dumps(failure)
def clip_segments(clip_id: str) -> str:
    """Return a Markdown listing of the action segments for *clip_id*.

    Rows of the segments table whose ``video`` column equals
    ``"<clip_id>.mp4"`` are sorted by ``start_sec`` and rendered one per
    paragraph as a time range, action and object. Only the first 12 entries
    are shown; when there are more, a trailing note is appended. If anything
    raises, an italic error message is returned instead.
    """
    try:
        segments = load_segments_df()
        wanted = f"{clip_id}.mp4"
        ordered = segments[segments["video"] == wanted].sort_values("start_sec")
        entries = [
            f"**{seg['start_sec']:.1f}s–{seg['end_sec']:.1f}s** · "
            f"{seg.get('action', '?')} **{seg.get('object', '?')}**"
            for _, seg in ordered.iterrows()
        ]
        if len(entries) > 12:
            tail = "\n\n_…and more in full pack_"
        else:
            tail = ""
        return "\n\n".join(entries[:12]) + tail
    except Exception as exc:
        return f"_Could not load segments: {exc}_"
def update(clip_label: str, layer: str):
    """Refresh the video, metadata and segment panes for a selection.

    Args:
        clip_label: Display label chosen in the clip dropdown; it is mapped
            back to its clip id through ``CLIPS``.
        layer: Layer name chosen in the layer selector.

    Returns:
        A ``(video, metadata_json, segments_markdown)`` tuple. When the label
        matches no entry in ``CLIPS`` or one of the helpers raises, the video
        slot is ``None`` and the other two slots carry the error text.
    """
    # Give ``next`` a default: without it, a label that matches no entry
    # (including ``None``) raises StopIteration before the error handling
    # below is ever reached.
    clip_id = next((cid for cid, label in CLIPS if label == clip_label), None)
    if clip_id is None:
        problem = f"unknown clip: {clip_label!r}"
        return None, json.dumps({"error": problem}), f"_Preview failed: {problem}_"
    try:
        return clip_video_path(clip_id, layer), clip_metadata(clip_id), clip_segments(clip_id)
    except Exception as e:
        return None, json.dumps({"error": str(e)}), f"_Preview failed: {e}_"
with gr.Blocks(title="World Archive Data Explorer", theme=gr.themes.Soft()) as demo:
    # Page header: title, tagline, dataset links and a LeRobot usage snippet.
    # The markdown body stays at column 0 so the rendered string is unchanged.
    gr.Markdown(
        f"""
# World Archive
### {TAGLINE}
**Mono Clear** — annotated egocentric manipulation from Indian workplaces (factory, kitchen, repair, craft).
| 9 clips | 218 segments | 8+ layers | LeRobot-ready |
|:-------:|:------------:|:---------:|:-------------:|
[Metadata dataset](https://huggingface.co/datasets/{REPO}) · [LeRobot mirror](https://huggingface.co/datasets/{LEROBOT_REPO}) · [Collection](https://huggingface.co/collections/WorldArchive/physical-ai-india)
[Book a call]({CALENDLY}) · shubham@worldarchive.co · [Full pack ~19 GB]({S3}/index.html) · [worldarchive.co](https://worldarchive.co)
```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
ds = LeRobotDataset("{LEROBOT_REPO}") # 9 episodes · 46k frames
```
"""
    )

    # Selector row: which clip to show and which layer of it.
    with gr.Row():
        clip_dd = gr.Dropdown(
            choices=[label for _, label in CLIPS],
            value=CLIPS[0][1],
            label="Clip",
        )
        layer_dd = gr.Radio(
            choices=LAYERS,
            value="plain",
            label="Layer",
            info="plain · hand skeleton · object boxes",
        )

    video = gr.Video(label="Preview (6s)", autoplay=True)

    # Detail row: raw clip metadata next to the action segment listing.
    with gr.Row():
        meta = gr.Code(label="Clip metadata", language="json", scale=1)
        segs = gr.Markdown(label="Action segments", scale=1)

    # Changing either selector, and the initial page load, all run ``update``
    # with the current selections and fill the same three output components.
    clip_dd.change(fn=update, inputs=[clip_dd, layer_dd], outputs=[video, meta, segs])
    layer_dd.change(fn=update, inputs=[clip_dd, layer_dd], outputs=[video, meta, segs])
    demo.load(fn=update, inputs=[clip_dd, layer_dd], outputs=[video, meta, segs])

if __name__ == "__main__":
    demo.launch()