File size: 4,921 Bytes
1e3598e
bd21695
 
 
7a4136f
 
0e7d3e3
7a4136f
bd21695
 
0e7d3e3
7a4136f
bd21695
31abd36
0e7d3e3
1e3598e
 
 
 
 
bd21695
1e3598e
 
 
 
 
 
 
 
 
bd21695
 
0e7d3e3
 
 
 
 
 
 
 
 
 
 
 
bd21695
 
 
 
7a4136f
 
 
 
 
 
 
 
 
bd21695
 
 
0e7d3e3
 
 
 
 
1e3598e
bd21695
0e7d3e3
 
 
 
 
 
 
 
 
1e3598e
 
0e7d3e3
1e3598e
0e7d3e3
 
bd21695
 
 
 
7a4136f
1e3598e
7a4136f
1e3598e
bd21695
 
 
 
0e7d3e3
1e3598e
 
0e7d3e3
1e3598e
0e7d3e3
1e3598e
 
0e7d3e3
3a28087
 
 
0e7d3e3
 
 
1e3598e
0e7d3e3
bd21695
 
 
 
1e3598e
bd21695
0e7d3e3
1e3598e
 
0e7d3e3
 
 
bd21695
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""World Archive Mono sample — layer-switching data explorer (all 9 clips)."""
from __future__ import annotations

import json
import shutil
import tempfile
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, hf_hub_url

# Hugging Face repo id that the loaders and the preview helper in this file
# fetch from; every call passes repo_type="dataset".
REPO = "WorldArchive/mono-india-workplace-sample"
# Repo id that, in this file, only appears in the header links and in the
# LeRobot code snippet shown to the user.
LEROBOT_REPO = "WorldArchive/mono-india-workplace-lerobot"
# Subtitle rendered under the page heading.
TAGLINE = "Ground truth from the real economy."
# Target of the header's "Book a call" link.
CALENDLY = "https://calendly.com/algorithmsdheeraj/30min"
# Base URL; the header's "Full pack" link points at "<S3>/index.html".
S3 = "https://ggn-egocentric-data-sample.s3.ap-south-1.amazonaws.com/sample_data_june"

# (clip_id, display label) pairs.
# The label is what the dropdown shows; the clip_id is used to build the
# preview path ("clips_preview/<clip_id>/<layer>.mp4"), to match the
# "clip_id" column of the clips table, and (as "<clip_id>.mp4") the "video"
# column of the segments table.
CLIPS = [
    ("sample_01_shuttle_tube_packaging", "01 · Shuttle packaging"),
    ("sample_02_industrial_sewing_machine", "02 · Industrial sewing"),
    ("sample_03_heatgun_and_batching", "03 · Heat gun & batching"),
    ("sample_04_garment_ironing_and_packing", "04 · Garment ironing"),
    ("sample_05_commercial_catering", "05 · Commercial catering"),
    ("sample_06_cane_weaving", "06 · Cane weaving"),
    ("sample_07_car_detailing", "07 · Car detailing"),
    ("sample_08_primer_and_painting", "08 · Primer & painting"),
    ("sample_09_denting_and_filing", "09 · Denting & filing"),
]
# Preview variants offered by the layer radio; each name is also the file
# stem of the corresponding preview video ("<layer>.mp4").
LAYERS = ["plain", "skeleton", "boxes"]


@lru_cache(maxsize=1)
def load_segments_df() -> pd.DataFrame:
    """Return the dataset's segments table as a DataFrame.

    Fetches ``data/segments.parquet`` from the ``REPO`` dataset with
    ``hf_hub_download`` and parses it with pandas. The function takes no
    arguments and is wrapped in ``lru_cache``, so after the first successful
    call every caller receives the same cached DataFrame object.

    Raises:
        Whatever ``hf_hub_download`` or ``pd.read_parquet`` raise; a failed
        call is not cached.
    """
    return pd.read_parquet(
        hf_hub_download(REPO, "data/segments.parquet", repo_type="dataset")
    )


@lru_cache(maxsize=1)
def load_clips_df() -> pd.DataFrame:
    """Return the dataset's clips table as a DataFrame.

    Fetches ``data/clips.parquet`` from the ``REPO`` dataset with
    ``hf_hub_download`` and parses it with pandas. Memoized via
    ``lru_cache``: once a call succeeds, later calls return the same
    DataFrame object without downloading or parsing again.

    Raises:
        Whatever ``hf_hub_download`` or ``pd.read_parquet`` raise; a failed
        call is not cached.
    """
    parquet_file = hf_hub_download(REPO, "data/clips.parquet", repo_type="dataset")
    clips_table = pd.read_parquet(parquet_file)
    return clips_table


def clip_video_path(clip_id: str, layer: str) -> str:
    """Return a playable location for one preview video.

    The file ``clips_preview/<clip_id>/<layer>.mp4`` is downloaded from the
    ``REPO`` dataset and mirrored to
    ``<system temp dir>/wa_explorer/<clip_id>/<layer>.mp4``. The mirror is
    (re)written only when it is missing or older than the downloaded file.

    Args:
        clip_id: Clip identifier used as the directory name.
        layer: Preview variant used as the file stem.

    Returns:
        The mirrored file's path as a string, or — if anything in the
        download/copy path raises — the file's hub URL from ``hf_hub_url``.
    """
    rel = f"clips_preview/{clip_id}/{layer}.mp4"
    try:
        downloaded = Path(hf_hub_download(REPO, rel, repo_type="dataset"))
        # NOTE(review): presumably mirrored under the temp dir so the UI is
        # allowed to serve the file — confirm against the Gradio version in use.
        mirror_dir = Path(tempfile.gettempdir()) / "wa_explorer" / clip_id
        mirror = mirror_dir / f"{layer}.mp4"
        mirror_dir.mkdir(parents=True, exist_ok=True)
        # copy2 preserves the source mtime, so an untouched mirror compares
        # equal (not older) and is left alone on later calls.
        up_to_date = (
            mirror.exists()
            and mirror.stat().st_mtime >= downloaded.stat().st_mtime
        )
        if not up_to_date:
            shutil.copy2(downloaded, mirror)
        return str(mirror)
    except Exception:
        # Best-effort: fall back to the remote URL rather than failing the UI.
        return hf_hub_url(REPO, rel, repo_type="dataset")


def clip_metadata(clip_id: str) -> str:
    """Return the clips-table row for *clip_id* as a JSON string.

    The first row of the clips table whose ``clip_id`` column equals
    *clip_id* is converted to a dict and serialized with two-space
    indentation; values the JSON encoder cannot handle are stringified via
    ``default=str``.

    Args:
        clip_id: Identifier to look up in the ``clip_id`` column.

    Returns:
        The row as pretty-printed JSON. If the clip is not in the table, or
        loading/serializing fails, a JSON object with the keys ``clip_id``
        and ``error`` is returned instead; this function does not raise.
    """
    try:
        clips = load_clips_df()
        matches = clips[clips["clip_id"] == clip_id]
        if matches.empty:
            # Without this guard ``.iloc[0]`` raises IndexError and the user
            # sees "single positional indexer is out-of-bounds", which says
            # nothing about the actual problem.
            return json.dumps(
                {"clip_id": clip_id, "error": "clip_id not found in clips table"}
            )
        row = matches.iloc[0].to_dict()
        return json.dumps(row, indent=2, default=str)
    except Exception as e:
        # UI boundary: surface the failure in the metadata panel.
        return json.dumps({"clip_id": clip_id, "error": str(e)})


def clip_segments(clip_id: str) -> str:
    """Render a clip's action segments as Markdown.

    Selects the segments-table rows whose ``video`` column equals
    ``"<clip_id>.mp4"``, orders them by ``start_sec`` and formats one
    paragraph per row: the start–end times (one decimal place) in bold,
    then the ``action`` and the bolded ``object`` (``'?'`` when the row has
    no such field). At most 12 rows are shown; when more exist a trailing
    note is appended.

    Args:
        clip_id: Clip identifier; matched against ``video`` with an
            ``.mp4`` suffix appended.

    Returns:
        The Markdown text, or an italic "Could not load segments" message
        carrying the exception text if anything raises.
    """
    try:
        segments = load_segments_df()
        belongs_to_clip = segments["video"] == f"{clip_id}.mp4"
        ordered = segments[belongs_to_clip].sort_values("start_sec")
        entries = [
            f"**{seg['start_sec']:.1f}s–{seg['end_sec']:.1f}s** · "
            f"{seg.get('action', '?')} **{seg.get('object', '?')}**"
            for _, seg in ordered.iterrows()
        ]
        markdown = "\n\n".join(entries[:12])
        if len(entries) > 12:
            markdown += "\n\n_…and more in full pack_"
        return markdown
    except Exception as err:
        return f"_Could not load segments: {err}_"


def update(clip_label: str, layer: str):
    """Gradio callback: refresh the video, metadata and segments panels.

    Args:
        clip_label: Display label selected in the clip dropdown; resolved to
            a clip id through ``CLIPS``.
        layer: Preview variant selected in the layer radio.

    Returns:
        A 3-tuple ``(video, metadata_json, segments_markdown)``. On success
        these come from ``clip_video_path``, ``clip_metadata`` and
        ``clip_segments``. If the label is not in ``CLIPS`` or any helper
        raises, the tuple is ``(None, <JSON with an "error" key>,
        <italic "Preview failed" message>)``; this function does not raise.
    """
    # Look the label up with a default: a bare ``next(...)`` raises
    # StopIteration for an unknown label, and that lookup used to sit outside
    # the try block, so the error escaped the callback instead of being shown.
    clip_id = next((cid for cid, label in CLIPS if label == clip_label), None)
    if clip_id is None:
        reason = f"unknown clip {clip_label!r}"
        return None, json.dumps({"error": reason}), f"_Preview failed: {reason}_"
    try:
        return clip_video_path(clip_id, layer), clip_metadata(clip_id), clip_segments(clip_id)
    except Exception as e:
        return None, json.dumps({"error": str(e)}), f"_Preview failed: {e}_"


# Page layout: header, clip/layer selectors, preview video, then the
# metadata and segments panels side by side.
with gr.Blocks(title="World Archive Data Explorer", theme=gr.themes.Soft()) as demo:
    # Header: tagline, dataset summary table, links and a LeRobot snippet.
    gr.Markdown(
        f"""
# World Archive
### {TAGLINE}

**Mono Clear** — annotated egocentric manipulation from Indian workplaces (factory, kitchen, repair, craft).

| 9 clips | 218 segments | 8+ layers | LeRobot-ready |
|:-------:|:------------:|:---------:|:-------------:|

[Metadata dataset](https://huggingface.co/datasets/{REPO}) · [LeRobot mirror](https://huggingface.co/datasets/{LEROBOT_REPO}) · [Collection](https://huggingface.co/collections/WorldArchive/physical-ai-india)

[Book a call]({CALENDLY}) · shubham@worldarchive.co · [Full pack ~19 GB]({S3}/index.html) · [worldarchive.co](https://worldarchive.co)

```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
ds = LeRobotDataset("{LEROBOT_REPO}")  # 9 episodes · 46k frames
```
"""
    )

    # Selectors: the dropdown lists the display labels and starts on the
    # first clip; the radio starts on the "plain" layer.
    with gr.Row():
        clip_dd = gr.Dropdown(
            [label for _, label in CLIPS],
            value=CLIPS[0][1],
            label="Clip",
        )
        layer_dd = gr.Radio(
            LAYERS,
            value="plain",
            label="Layer",
            info="plain · hand skeleton · object boxes",
        )

    video = gr.Video(label="Preview (6s)", autoplay=True)

    with gr.Row():
        meta = gr.Code(label="Clip metadata", language="json", scale=1)
        segs = gr.Markdown(label="Action segments", scale=1)

    # The same callback drives all three panels: on a change of either
    # selector, and through demo.load for the initial render.
    for register in (clip_dd.change, layer_dd.change, demo.load):
        register(update, [clip_dd, layer_dd], [video, meta, segs])

if __name__ == "__main__":
    demo.launch()