Spaces:

WorldArchive
/

data-explorer

Runtime error

File size: 4,921 Bytes

1e3598e
bd21695
 
 
7a4136f
 
0e7d3e3
7a4136f
bd21695
 
0e7d3e3
7a4136f
bd21695
31abd36
0e7d3e3
1e3598e
 
 
 
 
bd21695
1e3598e
 
 
 
 
 
 
 
 
bd21695
 
0e7d3e3
 
 
 
 
 
 
 
 
 
 
 
bd21695
 
 
 
7a4136f
 
 
 
 
 
 
 
 
bd21695
 
 
0e7d3e3
 
 
 
 
1e3598e
bd21695
0e7d3e3
 
 
 
 
 
 
 
 
1e3598e
 
0e7d3e3
1e3598e
0e7d3e3
 
bd21695
 
 
 
7a4136f
1e3598e
7a4136f
1e3598e
bd21695
 
 
 
0e7d3e3
1e3598e
 
0e7d3e3
1e3598e
0e7d3e3
1e3598e
 
0e7d3e3
3a28087
 
 
0e7d3e3
 
 
1e3598e
0e7d3e3
bd21695
 
 
 
1e3598e
bd21695
0e7d3e3
1e3598e
 
0e7d3e3
 
 
bd21695

"""World Archive Mono sample — layer-switching data explorer (all 9 clips)."""
from __future__ import annotations

import json
import shutil
import tempfile
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, hf_hub_url

# Hugging Face dataset repo that the loaders and clip_video_path() download from
# (always with repo_type="dataset"); also linked in the page header.
REPO = "WorldArchive/mono-india-workplace-sample"
# Second dataset repo; only referenced in the page header (link + LeRobot snippet).
LEROBOT_REPO = "WorldArchive/mono-india-workplace-lerobot"
# Sub-heading rendered directly under the page title.
TAGLINE = "Ground truth from the real economy."
# Target of the "Book a call" link in the page header.
CALENDLY = "https://calendly.com/algorithmsdheeraj/30min"
# Base URL for the full data pack; the header links to `{S3}/index.html`.
S3 = "https://ggn-egocentric-data-sample.s3.ap-south-1.amazonaws.com/sample_data_june"

# (clip_id, display label)
# The label is what the dropdown shows; update() maps it back to clip_id, which
# is then used to build the preview path (`clips_preview/{clip_id}/...`), to
# filter the clips table on "clip_id", and to filter the segments table on
# "video" == `{clip_id}.mp4`.
CLIPS = [
    ("sample_01_shuttle_tube_packaging", "01 · Shuttle packaging"),
    ("sample_02_industrial_sewing_machine", "02 · Industrial sewing"),
    ("sample_03_heatgun_and_batching", "03 · Heat gun & batching"),
    ("sample_04_garment_ironing_and_packing", "04 · Garment ironing"),
    ("sample_05_commercial_catering", "05 · Commercial catering"),
    ("sample_06_cane_weaving", "06 · Cane weaving"),
    ("sample_07_car_detailing", "07 · Car detailing"),
    ("sample_08_primer_and_painting", "08 · Primer & painting"),
    ("sample_09_denting_and_filing", "09 · Denting & filing"),
]
# Choices for the "Layer" radio; each value doubles as the preview file stem
# (`{layer}.mp4`) in clip_video_path().
LAYERS = ["plain", "skeleton", "boxes"]


@lru_cache(maxsize=1)
def load_segments_df() -> pd.DataFrame:
    """Return the segments table of the sample dataset.

    Fetches ``data/segments.parquet`` from ``REPO`` through the Hugging Face
    hub client and parses it with pandas. The result is memoised after the
    first successful call, so every caller receives the same DataFrame object.
    """
    parquet_file = hf_hub_download(
        repo_id=REPO,
        filename="data/segments.parquet",
        repo_type="dataset",
    )
    segments = pd.read_parquet(parquet_file)
    return segments


@lru_cache(maxsize=1)
def load_clips_df() -> pd.DataFrame:
    """Return the clips table of the sample dataset.

    Fetches ``data/clips.parquet`` from ``REPO`` through the Hugging Face hub
    client and parses it with pandas. The result is memoised after the first
    successful call, so every caller receives the same DataFrame object.
    """
    parquet_file = hf_hub_download(
        repo_id=REPO,
        filename="data/clips.parquet",
        repo_type="dataset",
    )
    clips = pd.read_parquet(parquet_file)
    return clips


def clip_video_path(clip_id: str, layer: str) -> str:
    """Resolve a playable location for one preview layer of a clip.

    The preview ``clips_preview/{clip_id}/{layer}.mp4`` is downloaded from
    ``REPO`` and mirrored into ``<tempdir>/wa_explorer/{clip_id}/``; the path
    of that mirror is returned. The mirror is rewritten only when it is
    missing or older than the downloaded file.

    Best effort: if any step of the download or copy fails, the direct hub URL
    of the same file is returned instead of raising.
    """
    remote_name = f"clips_preview/{clip_id}/{layer}.mp4"
    try:
        cached = Path(hf_hub_download(REPO, remote_name, repo_type="dataset"))
        mirror = Path(tempfile.gettempdir(), "wa_explorer", clip_id, f"{layer}.mp4")
        mirror.parent.mkdir(parents=True, exist_ok=True)
        # copy2 keeps the source mtime, so an up-to-date mirror compares equal.
        up_to_date = mirror.exists() and mirror.stat().st_mtime >= cached.stat().st_mtime
        if not up_to_date:
            shutil.copy2(cached, mirror)
        return str(mirror)
    except Exception:
        return hf_hub_url(REPO, remote_name, repo_type="dataset")


def clip_metadata(clip_id: str) -> str:
    """Return the clips-table row for ``clip_id`` as a JSON string.

    On success the first matching row is serialised with two-space
    indentation; values JSON cannot encode natively are passed through
    ``str``. Any failure (table unavailable, no matching row, ...) yields a
    compact JSON object holding the ``clip_id`` and the error text instead of
    raising.
    """
    try:
        clips = load_clips_df()
        is_match = clips["clip_id"] == clip_id
        record = clips.loc[is_match].iloc[0].to_dict()
        payload = json.dumps(record, indent=2, default=str)
    except Exception as exc:
        payload = json.dumps({"clip_id": clip_id, "error": str(exc)})
    return payload


def clip_segments(clip_id: str, limit: int = 12) -> str:
    """Render the first action segments of a clip as Markdown.

    Rows of the segments table whose ``video`` equals ``{clip_id}.mp4`` are
    ordered by ``start_sec``; the first ``limit`` of them become one Markdown
    paragraph each (time range, action, object). A trailing note is appended
    when more rows exist than are shown.

    Args:
        clip_id: Clip identifier; matched against the ``video`` column with
            an ``.mp4`` suffix appended.
        limit: Maximum number of segments to render (non-negative).

    Returns:
        The Markdown text (empty when the clip has no segments), or an
        italic error line if the table cannot be loaded or formatted.
    """

    def _shown(row, key):
        # A missing column or a null cell both render as "?" rather than
        # leaking "nan"/"None" into the page.
        value = row.get(key, "?")
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return "?"
        return value

    try:
        segments = load_segments_df()
        rows = segments[segments["video"] == f"{clip_id}.mp4"].sort_values("start_sec")
        # Only format what is displayed, so a malformed row beyond the
        # visible window cannot blank the whole panel.
        lines = [
            f"**{r['start_sec']:.1f}s–{r['end_sec']:.1f}s** · "
            f"{_shown(r, 'action')} **{_shown(r, 'object')}**"
            for _, r in rows.head(limit).iterrows()
        ]
        suffix = "\n\n_…and more in full pack_" if len(rows) > limit else ""
        return "\n\n".join(lines) + suffix
    except Exception as e:
        return f"_Could not load segments: {e}_"


def update(clip_label: str, layer: str):
    """Gradio callback: refresh the video, metadata and segments panels.

    Args:
        clip_label: Display label chosen in the clip dropdown; mapped back to
            its clip id through ``CLIPS``.
        layer: Preview layer name chosen in the layer radio.

    Returns:
        A ``(video, metadata_json, segments_markdown)`` triple. If the label
        is unknown (for example a cleared dropdown) or anything else fails,
        the triple is ``(None, <error JSON>, <italic error line>)`` so the UI
        shows a message instead of the handler raising.
    """

    def _failure(reason: str):
        return None, json.dumps({"error": reason}), f"_Preview failed: {reason}_"

    try:
        # Look the label up inside the try block and with a default: a bare
        # next() would leak StopIteration into the event handler.
        clip_id = next((cid for cid, label in CLIPS if label == clip_label), None)
        if clip_id is None:
            return _failure(f"unknown clip {clip_label!r}")
        return clip_video_path(clip_id, layer), clip_metadata(clip_id), clip_segments(clip_id)
    except Exception as e:
        return _failure(str(e))


# Build the page at import time: a static header, the two selectors, the video
# preview, and a metadata/segments row. `demo` is the Blocks app that the
# __main__ guard at the bottom launches.
with gr.Blocks(title="World Archive Data Explorer", theme=gr.themes.Soft()) as demo:
    # Static header; the f-string interpolates the module-level constants
    # (TAGLINE, REPO, LEROBOT_REPO, CALENDLY, S3). The counts in the table and
    # in the code snippet are literal text, not computed from the data.
    gr.Markdown(
        f"""
# World Archive
### {TAGLINE}

**Mono Clear** — annotated egocentric manipulation from Indian workplaces (factory, kitchen, repair, craft).

| 9 clips | 218 segments | 8+ layers | LeRobot-ready |
|:-------:|:------------:|:---------:|:-------------:|

[Metadata dataset](https://huggingface.co/datasets/{REPO}) · [LeRobot mirror](https://huggingface.co/datasets/{LEROBOT_REPO}) · [Collection](https://huggingface.co/collections/WorldArchive/physical-ai-india)

[Book a call]({CALENDLY}) · shubham@worldarchive.co · [Full pack ~19 GB]({S3}/index.html) · [worldarchive.co](https://worldarchive.co)

```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
ds = LeRobotDataset("{LEROBOT_REPO}")  # 9 episodes · 46k frames
```
"""
    )
    # Selectors: the dropdown lists the display labels from CLIPS (first clip
    # preselected); the radio lists LAYERS (default "plain").
    with gr.Row():
        clip_dd = gr.Dropdown([l for _, l in CLIPS], value=CLIPS[0][1], label="Clip")
        layer_dd = gr.Radio(LAYERS, value="plain", label="Layer", info="plain · hand skeleton · object boxes")
    # Receives the first element of update()'s return value (a local file path,
    # a hub URL, or None on failure).
    video = gr.Video(label="Preview (6s)", autoplay=True)
    # Side-by-side panels for the JSON metadata and the Markdown segment list.
    # NOTE(review): confirm the installed Gradio version accepts `scale` on
    # gr.Markdown — TODO verify against the pinned version.
    with gr.Row():
        meta = gr.Code(label="Clip metadata", language="json", scale=1)
        segs = gr.Markdown(label="Action segments", scale=1)
    # Changing either selector, and the initial page load, all call update()
    # with both current selections and write its triple into the three outputs.
    clip_dd.change(update, [clip_dd, layer_dd], [video, meta, segs])
    layer_dd.change(update, [clip_dd, layer_dd], [video, meta, segs])
    demo.load(update, [clip_dd, layer_dd], [video, meta, segs])

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()