#!/usr/bin/env python3
# Source: arc-ai-embodied-intelligence/scripts/setup_huggingface.py
# (uploaded with huggingface_hub, commit 10d4d9a)
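"""
Setup HuggingFace Hub repositories for ARC-AI project.

Creates:
  - arc-ai/embodied-intelligence (model: main project hub)
  - arc-ai/diffusion-policy-physics (model: physics-pretrained policy)
  - arc-ai/act-policy (model: Action Chunking Transformer policy)
  - arc-ai/sim-demonstrations (dataset: robot demo data)
  - arc-ai/sota-benchmarks (dataset: simulation benchmark results)

Usage:
  pip install huggingface_hub
  huggingface-cli login
  python scripts/setup_huggingface.py [--org ORG] [--dry-run] [--upload] [--sim-dir PATH]
"""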
import argparse
import json
import logging
from pathlib import Path
# Module-level logger used by every function in this script; level and format
# are set by logging.basicConfig in the __main__ block.
logger = logging.getLogger(__name__)
def create_repos(org: str = "arc-ai", dry_run: bool = False):
    """Create (or verify) the ARC-AI repositories on the HuggingFace Hub.

    Args:
        org: Hub organisation or username used as the prefix of every repo id.
        dry_run: When true, only log what would be created and return; the
            Hub client is neither imported nor called.

    Returns:
        The list of repo spec dicts (``repo_id``, ``repo_type``,
        ``description``), in both normal and dry-run mode.

    A failure for one repo is logged and the remaining repos are still
    attempted (best-effort setup).
    """
    # "description" is informational only: it is not passed to create_repo.
    repos = [
        {
            "repo_id": f"{org}/embodied-intelligence",
            "repo_type": "model",
            "description": "ARC-AI Embodied Intelligence - Physics-grounded manipulation policies",
        },
        {
            "repo_id": f"{org}/diffusion-policy-physics",
            "repo_type": "model",
            "description": "Diffusion Policy pretrained on THE WELL physics simulations",
        },
        {
            "repo_id": f"{org}/act-policy",
            "repo_type": "model",
            "description": "Action Chunking Transformer for robotic manipulation",
        },
        {
            "repo_id": f"{org}/sim-demonstrations",
            "repo_type": "dataset",
            "description": "10K expert demonstrations (7-DOF Franka reach task, MuJoCo)",
        },
        {
            "repo_id": f"{org}/sota-benchmarks",
            "repo_type": "dataset",
            "description": "SOTA simulation benchmark results (24 adversarial scenarios)",
        },
    ]

    if dry_run:
        for repo in repos:
            logger.info("[DRY RUN] Would create: %s (%s)", repo["repo_id"], repo["repo_type"])
        return repos

    # Imported lazily so that a dry run works without huggingface_hub installed.
    from huggingface_hub import create_repo

    for repo in repos:
        try:
            create_repo(
                repo_id=repo["repo_id"],
                repo_type=repo["repo_type"],
                exist_ok=True,  # re-running the script must not fail on existing repos
                private=False,
            )
        except Exception as e:
            # Deliberate best-effort: report and continue with the next repo.
            logger.error("Failed %s: %s", repo["repo_id"], e)
        else:
            logger.info("Created/verified: %s", repo["repo_id"])
    return repos
def upload_existing_artifacts(org: str = "arc-ai", sim_dir: str = None):
    """Upload existing trained models and results to HuggingFace.

    Args:
        org: Hub organisation or username that owns the target repos.
        sim_dir: Directory containing the artifacts. When omitted, a ``sim``
            directory four parent levels above this script is used.

    Files that do not exist are skipped with a warning. A failed upload is
    logged and the remaining uploads are still attempted.
    """
    from huggingface_hub import HfApi

    api = HfApi()
    # NOTE(review): the default climbs four parents from this file, i.e. above
    # the directory that contains scripts/ -- confirm that is where sim/ lives.
    sim_path = Path(sim_dir) if sim_dir else Path(__file__).parent.parent.parent.parent / "sim"

    # Each entry carries an explicit repo_type. It must match the type the repo
    # was created with in create_repos (sota-benchmarks is a *dataset* repo);
    # guessing the type from substrings of the repo id is wrong for that repo
    # and for any org name that happens to contain "policy" or "benchmarks".
    uploads = [
        {
            "file": sim_path / "diffusion_policy.pt",
            "repo": f"{org}/diffusion-policy-physics",
            "repo_type": "model",
            "path_in_repo": "checkpoints/diffusion_policy_500k.pt",
        },
        {
            "file": sim_path / "act_policy.pt",
            "repo": f"{org}/act-policy",
            "repo_type": "model",
            "path_in_repo": "checkpoints/act_policy_100k.pt",
        },
        {
            "file": sim_path / "sota_full_results.json",
            "repo": f"{org}/sota-benchmarks",
            "repo_type": "dataset",
            "path_in_repo": "results/sota_full_results.json",
        },
        {
            "file": sim_path / "SOTA_SIMULATION_REPORT.md",
            "repo": f"{org}/sota-benchmarks",
            "repo_type": "dataset",
            "path_in_repo": "REPORT.md",
        },
    ]

    for item in uploads:
        if not item["file"].exists():
            logger.warning("Not found: %s", item["file"])
            continue
        try:
            api.upload_file(
                path_or_fileobj=str(item["file"]),
                path_in_repo=item["path_in_repo"],
                repo_id=item["repo"],
                repo_type=item["repo_type"],
            )
        except Exception as e:
            # Deliberate best-effort: report and continue with the next file.
            logger.error("Upload failed %s: %s", item["file"].name, e)
        else:
            logger.info(
                "Uploaded: %s -> %s/%s",
                item["file"].name,
                item["repo"],
                item["path_in_repo"],
            )
def create_model_card(org: str = "arc-ai") -> str:
    """Generate model card for the main model repo.

    Builds the card text only (YAML front matter followed by Markdown); nothing
    is written to disk or uploaded here.

    Args:
        org: Hub organisation or username, interpolated into the dataset
            reference and the citation URL.

    Returns:
        The complete model card as a single string.
    """
    # The text below is an f-string: literal braces in the BibTeX entry are
    # doubled ({{ }}) so that only {org} is substituted. The usage snippet
    # imports torch explicitly because it calls torch.load / torch.randn.
    card = f"""---
library_name: pytorch
tags:
- robotics
- diffusion-policy
- embodied-ai
- physics-pretraining
- manipulation
license: apache-2.0
datasets:
- {org}/sim-demonstrations
- polymathic-ai/the_well
---
# ARC-AI: Physics-Grounded Diffusion Policy
Embodied intelligence system for autonomous robotic manipulation,
pretrained on diverse physics simulations from THE WELL.
## Architecture
- **Backbone**: Physics Temporal Encoder (Transformer, 4 layers, 8 heads)
- **Policy**: Denoising Diffusion Probabilistic Model (DDPM)
- **Parameters**: 1.5M (Diffusion) / 2.2M (ACT)
- **Pretraining**: THE WELL (15TB physics simulations)
- **Fine-tuning**: 10K expert demonstrations (MuJoCo, 7-DOF Franka)
## Benchmark Results
| Metric | Value |
|--------|-------|
| GPU Throughput | 8.97M samples/sec (131K parallel envs) |
| Training Speed | 64 steps/sec on A100 |
| Diffusion Loss | 0.046 (500K steps) |
| Adversarial Scenarios | 24 tested |
| Physics Stability | 10M steps, zero failures |
## Usage
```python
import torch
from arc_ai.policy import PhysicsDiffusionPolicy, PhysicsPretrainConfig
config = PhysicsPretrainConfig(hidden_dim=256, n_layers=4)
policy = PhysicsDiffusionPolicy(obs_dim=20, action_dim=7, config=config)
policy.load_state_dict(torch.load("checkpoints/diffusion_policy_500k.pt"))
# Inference
obs = torch.randn(1, 4, 20) # (batch, context_frames, obs_dim)
actions = policy.predict(obs, n_inference_steps=10)
# actions shape: (1, 16, 7) — 16-step action chunk
```
## Pretraining on THE WELL
```python
from arc_ai.training import PhysicsPretrainer, PhysicsPretrainConfig
config = PhysicsPretrainConfig(
datasets=["polymathic-ai/gray_scott", "polymathic-ai/rayleigh_benard"],
streaming=True,
num_steps=200000,
)
pretrainer = PhysicsPretrainer(config)
encoder = pretrainer.train()
```
## Citation
```bibtex
@software{{arc_ai_2026,
title={{ARC-AI: Physics-Grounded Embodied Intelligence}},
year={{2026}},
url={{https://huggingface.co/{org}/embodied-intelligence}}
}}
```
"""
    return card
def create_dataset_card(org: str = "arc-ai") -> str:
    """Generate dataset card for sim-demonstrations.

    Builds the card text only (YAML front matter followed by Markdown); nothing
    is written to disk or uploaded here.

    Args:
        org: Hub organisation or username, interpolated into the
            ``load_dataset`` example.

    Returns:
        The complete dataset card as a single string.
    """
    # NOTE(review): the table states 10,000 trajectories x 300 steps but
    # 2,840,000 total pairs (not 3,000,000) -- presumably some trajectories are
    # shorter than 300 steps; confirm and adjust the wording if so.
    # The loop body in the usage snippet is indented so that the example is
    # valid Python when copied from the rendered card.
    card = f"""---
task_categories:
- robotics
tags:
- manipulation
- mujoco
- franka
- expert-demonstrations
size_categories:
- 1M<n<10M
license: apache-2.0
---
# ARC-AI Simulation Demonstrations
10,000 expert manipulation trajectories for 7-DOF Franka Panda robot.
## Dataset Details
| Field | Value |
|-------|-------|
| Trajectories | 10,000 |
| Steps per trajectory | 300 |
| Total state-action pairs | 2,840,000 |
| Observation dim | 20 (joints + EE + object) |
| Action dim | 7 (joint positions) |
| Physics engine | MuJoCo |
| Expert algorithm | Jacobian IK (damped least-squares) |
| Task | Reach (EE to object position) |
## Observation Space
| Index | Description |
|-------|-------------|
| 0-6 | Joint positions (rad) |
| 7-13 | Joint velocities (rad/s) |
| 14-16 | End-effector XYZ (m) |
| 17-19 | Object XYZ (m) |
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{org}/sim-demonstrations", streaming=True)
for sample in ds["train"]:
    obs = sample["observations"] # (300, 20)
    actions = sample["actions"] # (300, 7)
```
"""
    return card
def main(argv=None):
    """Parse command-line options and run the HuggingFace repository setup.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Repos are always created (or listed, with ``--dry-run``). Artifacts are
    uploaded only when ``--upload`` is given and ``--dry-run`` is not.
    """
    parser = argparse.ArgumentParser(description="Setup HuggingFace repos for ARC-AI")
    parser.add_argument("--org", default="arc-ai", help="HuggingFace org/username")
    parser.add_argument("--dry-run", action="store_true", help="Print actions without executing")
    parser.add_argument("--upload", action="store_true", help="Upload existing artifacts")
    parser.add_argument("--sim-dir", default=None, help="Path to sim/ directory")
    args = parser.parse_args(argv)

    create_repos(org=args.org, dry_run=args.dry_run)

    if args.upload:
        if args.dry_run:
            # Make the skipped step visible instead of silently ignoring --upload.
            logger.info("[DRY RUN] Would upload existing artifacts (skipped)")
        else:
            upload_existing_artifacts(org=args.org, sim_dir=args.sim_dir)

    logger.info("Done. Run 'huggingface-cli login' first if not authenticated.")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    main()