Upload scripts/setup_huggingface.py with huggingface_hub

10d4d9a verified about 1 month ago

8.07 kB

	#!/usr/bin/env python3
	"""
	Setup HuggingFace Hub repositories for ARC-AI project.

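	Creates:
	- arc-ai/embodied-intelligence (model: main project hub)
	- arc-ai/diffusion-policy-physics (model: physics-pretrained policy)
	- arc-ai/act-policy (model: Action Chunking Transformer policy)
	- arc-ai/sim-demonstrations (dataset: robot demo data)
	- arc-ai/sota-benchmarks (dataset: simulation benchmark results)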

	Usage:
	pip install huggingface_hub
	huggingface-cli login
	python scripts/setup_huggingface.py [--org ORG] [--dry-run] [--upload] [--sim-dir PATH]
	"""

	import argparse
	import json
	import logging
	from pathlib import Path

	logger = logging.getLogger(__name__)


	def create_repos(org: str = "arc-ai", dry_run: bool = False):
	    """Create (or verify) the ARC-AI repositories on the HuggingFace Hub.

	    Five repositories are declared under ``org``: three model repos
	    (``embodied-intelligence``, ``diffusion-policy-physics``, ``act-policy``)
	    and two dataset repos (``sim-demonstrations``, ``sota-benchmarks``).

	    Args:
	        org: HuggingFace organization or username that owns the repos.
	        dry_run: When true, only log what would be created; nothing is
	            sent to the Hub and ``huggingface_hub`` is not imported.

	    Returns:
	        The list of repo specs (dicts with ``repo_id``, ``repo_type`` and
	        ``description``), whether or not each creation succeeded.
	    """
	    # NOTE: "description" is informational only; it is not passed to
	    # create_repo below.
	    repos = [
	        {
	            "repo_id": f"{org}/embodied-intelligence",
	            "repo_type": "model",
	            "description": "ARC-AI Embodied Intelligence - Physics-grounded manipulation policies",
	        },
	        {
	            "repo_id": f"{org}/diffusion-policy-physics",
	            "repo_type": "model",
	            "description": "Diffusion Policy pretrained on THE WELL physics simulations",
	        },
	        {
	            "repo_id": f"{org}/act-policy",
	            "repo_type": "model",
	            "description": "Action Chunking Transformer for robotic manipulation",
	        },
	        {
	            "repo_id": f"{org}/sim-demonstrations",
	            "repo_type": "dataset",
	            "description": "10K expert demonstrations (7-DOF Franka reach task, MuJoCo)",
	        },
	        {
	            "repo_id": f"{org}/sota-benchmarks",
	            "repo_type": "dataset",
	            "description": "SOTA simulation benchmark results (24 adversarial scenarios)",
	        },
	    ]

	    if dry_run:
	        for repo in repos:
	            logger.info("[DRY RUN] Would create: %s (%s)", repo["repo_id"], repo["repo_type"])
	        return repos

	    # Imported only on the real path so that --dry-run works on a machine
	    # where huggingface_hub is not installed.
	    from huggingface_hub import create_repo

	    for repo in repos:
	        try:
	            create_repo(
	                repo_id=repo["repo_id"],
	                repo_type=repo["repo_type"],
	                exist_ok=True,  # re-running must not fail on repos that already exist
	                private=False,
	            )
	            logger.info("Created/verified: %s", repo["repo_id"])
	        except Exception as e:
	            # Deliberately best-effort: one failing repo must not stop the rest.
	            logger.error("Failed %s: %s", repo["repo_id"], e)

	    return repos


	def upload_existing_artifacts(org: str = "arc-ai", sim_dir: str = None):
	    """Upload existing trained models and results to HuggingFace.

	    Each artifact that exists on disk is uploaded to its target repository
	    under ``org``; missing files are logged as warnings and skipped, and a
	    failed upload is logged as an error without aborting the remaining ones.

	    Args:
	        org: HuggingFace organization or username that owns the repos.
	        sim_dir: Directory containing the artifacts. When omitted (or
	            empty), a ``sim`` directory four levels above this script is used.

	    Returns:
	        None.
	    """
	    from huggingface_hub import HfApi

	    api = HfApi()
	    # NOTE(review): the default walks four parents up from this file; confirm
	    # that this matches the actual repository layout.
	    sim_path = Path(sim_dir) if sim_dir else Path(__file__).parent.parent.parent.parent / "sim"

	    # Every entry names its repo type explicitly, matching the types used
	    # when the repos are created. Guessing the type from substrings of the
	    # repo id sent the sota-benchmarks files to a model repo, although that
	    # repo is a dataset, and could also be fooled by the org name.
	    uploads = [
	        {
	            "file": sim_path / "diffusion_policy.pt",
	            "repo": f"{org}/diffusion-policy-physics",
	            "repo_type": "model",
	            "path_in_repo": "checkpoints/diffusion_policy_500k.pt",
	        },
	        {
	            "file": sim_path / "act_policy.pt",
	            "repo": f"{org}/act-policy",
	            "repo_type": "model",
	            "path_in_repo": "checkpoints/act_policy_100k.pt",
	        },
	        {
	            "file": sim_path / "sota_full_results.json",
	            "repo": f"{org}/sota-benchmarks",
	            "repo_type": "dataset",
	            "path_in_repo": "results/sota_full_results.json",
	        },
	        {
	            "file": sim_path / "SOTA_SIMULATION_REPORT.md",
	            "repo": f"{org}/sota-benchmarks",
	            "repo_type": "dataset",
	            "path_in_repo": "REPORT.md",
	        },
	    ]

	    for item in uploads:
	        if not item["file"].exists():
	            logger.warning(f"Not found: {item['file']}")
	            continue
	        try:
	            api.upload_file(
	                path_or_fileobj=str(item["file"]),
	                path_in_repo=item["path_in_repo"],
	                repo_id=item["repo"],
	                repo_type=item["repo_type"],
	            )
	            logger.info(f"Uploaded: {item['file'].name} → {item['repo']}/{item['path_in_repo']}")
	        except Exception as e:
	            # Best-effort: keep going so one failed upload does not block the others.
	            logger.error(f"Upload failed {item['file'].name}: {e}")


	def create_model_card(org: str = "arc-ai"):
	    """Generate model card for the main model repo.

	    Builds the Markdown text (YAML front matter, architecture summary,
	    benchmark table, usage snippets and a BibTeX citation). The text is only
	    returned; this function does not write or upload anything.

	    Args:
	        org: HuggingFace organization or username, interpolated into the
	            dataset reference and the citation URL.

	    Returns:
	        The model card as a single string.
	    """
	    # Table rows use plain "|": "\|" is an invalid escape sequence in a
	    # non-raw string and would leave literal backslashes in the rendered card.
	    card = f"""---
	library_name: pytorch
	tags:
	- robotics
	- diffusion-policy
	- embodied-ai
	- physics-pretraining
	- manipulation
	license: apache-2.0
	datasets:
	- {org}/sim-demonstrations
	- polymathic-ai/the_well
	---

	# ARC-AI: Physics-Grounded Diffusion Policy

	Embodied intelligence system for autonomous robotic manipulation,
	pretrained on diverse physics simulations from THE WELL.

	## Architecture

	- Backbone: Physics Temporal Encoder (Transformer, 4 layers, 8 heads)
	- Policy: Denoising Diffusion Probabilistic Model (DDPM)
	- Parameters: 1.5M (Diffusion) / 2.2M (ACT)
	- Pretraining: THE WELL (15TB physics simulations)
	- Fine-tuning: 10K expert demonstrations (MuJoCo, 7-DOF Franka)

	## Benchmark Results

	| Metric | Value |
	|--------|-------|
	| GPU Throughput | 8.97M samples/sec (131K parallel envs) |
	| Training Speed | 64 steps/sec on A100 |
	| Diffusion Loss | 0.046 (500K steps) |
	| Adversarial Scenarios | 24 tested |
	| Physics Stability | 10M steps, zero failures |

	## Usage

	```python
	import torch

	from arc_ai.policy import PhysicsDiffusionPolicy, PhysicsPretrainConfig

	config = PhysicsPretrainConfig(hidden_dim=256, n_layers=4)
	policy = PhysicsDiffusionPolicy(obs_dim=20, action_dim=7, config=config)
	policy.load_state_dict(torch.load("checkpoints/diffusion_policy_500k.pt"))

	# Inference
	obs = torch.randn(1, 4, 20) # (batch, context_frames, obs_dim)
	actions = policy.predict(obs, n_inference_steps=10)
	# actions shape: (1, 16, 7) — 16-step action chunk
	```

	## Pretraining on THE WELL

	```python
	from arc_ai.training import PhysicsPretrainer, PhysicsPretrainConfig

	config = PhysicsPretrainConfig(
	    datasets=["polymathic-ai/gray_scott", "polymathic-ai/rayleigh_benard"],
	    streaming=True,
	    num_steps=200000,
	)
	pretrainer = PhysicsPretrainer(config)
	encoder = pretrainer.train()
	```

	## Citation

	```bibtex
	@software{{arc_ai_2026,
	  title={{ARC-AI: Physics-Grounded Embodied Intelligence}},
	  year={{2026}},
	  url={{https://huggingface.co/{org}/embodied-intelligence}}
	}}
	```
	"""
	    return card


	def create_dataset_card(org: str = "arc-ai"):
	    """Generate dataset card for sim-demonstrations.

	    Builds the Markdown text (YAML front matter, dataset details, observation
	    layout and a loading snippet). The text is only returned; this function
	    does not write or upload anything.

	    Args:
	        org: HuggingFace organization or username, interpolated into the
	            ``load_dataset`` example.

	    Returns:
	        The dataset card as a single string.
	    """
	    # Table rows use plain "|": "\|" is an invalid escape sequence in a
	    # non-raw string and would leave literal backslashes in the rendered card.
	    card = f"""---
	task_categories:
	- robotics
	tags:
	- manipulation
	- mujoco
	- franka
	- expert-demonstrations
	size_categories:
	- 1M<n<10M
	license: apache-2.0
	---

	# ARC-AI Simulation Demonstrations

	10,000 expert manipulation trajectories for 7-DOF Franka Panda robot.

	## Dataset Details

	| Field | Value |
	|-------|-------|
	| Trajectories | 10,000 |
	| Steps per trajectory | 300 |
	| Total state-action pairs | 2,840,000 |
	| Observation dim | 20 (joints + EE + object) |
	| Action dim | 7 (joint positions) |
	| Physics engine | MuJoCo |
	| Expert algorithm | Jacobian IK (damped least-squares) |
	| Task | Reach (EE to object position) |

	## Observation Space

	| Index | Description |
	|-------|-------------|
	| 0-6 | Joint positions (rad) |
	| 7-13 | Joint velocities (rad/s) |
	| 14-16 | End-effector XYZ (m) |
	| 17-19 | Object XYZ (m) |

	## Usage

	```python
	from datasets import load_dataset

	ds = load_dataset("{org}/sim-demonstrations", streaming=True)
	for sample in ds["train"]:
	    obs = sample["observations"] # (300, 20)
	    actions = sample["actions"] # (300, 7)
	```
	"""
	    return card


	if __name__ == "__main__":
	    # Timestamped INFO-level logging for the whole run.
	    logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)

	    # Command-line interface: each entry is (flag, add_argument keyword args).
	    cli = argparse.ArgumentParser(description="Setup HuggingFace repos for ARC-AI")
	    for flag, settings in (
	        ("--org", {"default": "arc-ai", "help": "HuggingFace org/username"}),
	        ("--dry-run", {"action": "store_true", "help": "Print actions without executing"}),
	        ("--upload", {"action": "store_true", "help": "Upload existing artifacts"}),
	        ("--sim-dir", {"default": None, "help": "Path to sim/ directory"}),
	    ):
	        cli.add_argument(flag, **settings)
	    options = cli.parse_args()

	    # Repositories are always processed (only logged when --dry-run is set).
	    create_repos(org=options.org, dry_run=options.dry_run)

	    # Artifacts are uploaded only on request, and never during a dry run.
	    wants_upload = options.upload and not options.dry_run
	    if wants_upload:
	        upload_existing_artifacts(org=options.org, sim_dir=options.sim_dir)

	    logger.info("Done. Run 'huggingface-cli login' first if not authenticated.")