oasis-500m / oasis_wrapper.py

Update oasis_wrapper.py

c4274b4 verified 22 days ago

14 kB

	"""
	Oasis 500M — sai_wm third-party wrapper.

	Loaded via trust_remote_code=True:

	wm = AutoWorldModel.from_pretrained(
	"your-org/oasis-minecraft",
	trust_remote_code=True,
	device="cuda:0",
	)

	The src/ directory (dit.py, vae.py, utils/) is included alongside
	this file in the HF repo. Weights are downloaded from Etched/oasis-500m.
	"""

	import json
	import logging
	import os
	import sys

	import numpy as np
	import torch
	import torch.nn as nn
	from einops import rearrange
	from huggingface_hub import hf_hub_download

	logger = logging.getLogger(__name__)

	# Number of action keys (matches open-oasis generate.py)
	NUM_ACTION_KEYS = 25


	def sigmoid_beta_schedule(timesteps, start=-3, end=3, tau=1, clamp_min=1e-5):
	    """Build the sigmoid beta (noise) schedule — ported from open-oasis utils.py.

	    A cumulative-alpha curve is sampled on ``timesteps + 1`` evenly spaced
	    points, normalised so its first entry is 1, and converted to per-step
	    betas.

	    Parameters
	    ----------
	    timesteps : int
	        Number of diffusion steps; the result has this many entries.
	    start, end : float
	        Range of the sigmoid argument swept across the schedule.
	    tau : float
	        Temperature dividing the sigmoid argument.
	    clamp_min : float
	        Accepted for signature compatibility; not used by the computation.

	    Returns
	    -------
	    torch.Tensor
	        float64 tensor of shape ``(timesteps,)`` clipped to ``[0, 0.999]``.
	    """
	    n_points = timesteps + 1
	    grid = torch.linspace(0, timesteps, n_points, dtype=torch.float64) / timesteps

	    # Sigmoid values at the two ends of the sweep, used for normalisation.
	    sig_lo = torch.tensor(start / tau).sigmoid()
	    sig_hi = torch.tensor(end / tau).sigmoid()

	    sweep = (grid * (end - start) + start) / tau
	    cumulative = (-sweep.sigmoid() + sig_hi) / (sig_hi - sig_lo)
	    cumulative = cumulative / cumulative[0]

	    # beta_k = 1 - abar_k / abar_{k-1}
	    step_ratio = cumulative[1:] / cumulative[:-1]
	    return torch.clip(1 - step_ratio, 0, 0.999)


	def _ensure_src_importable():
	    """Make the bundled ``src`` package importable as ``src.*``.

	    The modules inside ``src/`` use relative imports, so ``src`` has to be
	    imported as a package.  That requires the directory *containing* ``src/``
	    (the directory of this file) to be on ``sys.path`` — not ``src/`` itself.

	    Does nothing when no ``src/`` directory sits next to this file, and is
	    idempotent: the directory is inserted at most once.
	    """
	    this_dir = os.path.dirname(os.path.abspath(__file__))
	    if not os.path.isdir(os.path.join(this_dir, "src")):
	        return
	    # Guard on the directory that is actually inserted.  Whether ``src/``
	    # itself happens to be on sys.path is irrelevant for ``from src.x import``.
	    if this_dir not in sys.path:
	        sys.path.insert(0, this_dir)


	class OasisWorldModel:
	"""
	Oasis 500M world model — sai_wm third-party wrapper.

	Loads DiT backbone + ViT-VAE from the bundled src/ package,
	downloads weights from Etched/oasis-500m, wraps the diffusion
	sampling loop (matching generate.py) into forward/predict.
	"""

	def __init__(
	self,
	world_config: dict,
	np_random=None,
	device: str = "cpu",
	ddim_steps: int = 10,
	noise_abs_max: float = 20.0,
	):
	self.device = device
	self.np_random = np_random or np.random.default_rng()

	# ── Load config ───────────────────────────────────────
	repo_id = world_config.get("repo_id", "")
	model_file = world_config.get("model_file", "")

	cache_dir = os.path.expanduser("~/.cache/sai/world_models")
	os.makedirs(cache_dir, exist_ok=True)

	config_filename = f"{model_file}/config.json" if model_file else "config.json"
	config_path = hf_hub_download(
	repo_id=repo_id, filename=config_filename,
	local_dir=cache_dir,
	)
	with open(config_path) as f:
	config = json.load(f)

	metadata = config.get("metadata", {})
	self.ddim_steps = ddim_steps or metadata.get("ddim_steps", 10)
	self.max_noise_level = metadata.get("max_noise_level", 1000)
	self.stabilization_level = metadata.get("stabilization_level", 15)
	self.scaling_factor = metadata.get("scaling_factor", 0.07843137255)
	self.noise_abs_max = noise_abs_max
	self.max_frames = metadata.get("max_frames", 32)

	# Build world_spec
	self.world_spec = type("WorldModelSpec", (), {
	"name": config.get("name", "oasis-500m"),
	"env": config.get("env", "Minecraft"),
	"model_type": "oasis",
	"metadata": metadata,
	"validate": lambda self: None,
	})()

	# ── Import model definitions from src/ ────────────────
	_ensure_src_importable()
	from src.dit import DiT
	from src.vae import VAE_models

	# ── Download and load weights ─────────────────────────
	weight_repo = metadata.get("weight_repo", "Etched/oasis-500m")
	dit_file = metadata.get("dit_file", "oasis500m.pt")
	vae_file = metadata.get("vae_file", "vit-l-20.pt")

	weight_dir = os.path.join(cache_dir, "oasis_weights")
	os.makedirs(weight_dir, exist_ok=True)

	dit_path = hf_hub_download(
	repo_id=weight_repo, filename=dit_file,
	local_dir=weight_dir,
	)
	vae_path = hf_hub_download(
	repo_id=weight_repo, filename=vae_file,
	local_dir=weight_dir,
	)

	# ── Load DiT (matching generate.py: DiT_models["DiT-S/2"]) ─
	logger.info("Loading Oasis DiT from %s", dit_path)
	self.dit = DiT(
	input_h=18, input_w=32, patch_size=2,
	in_channels=16, hidden_size=1024, depth=16,
	num_heads=16, mlp_ratio=4.0,
	external_cond_dim=NUM_ACTION_KEYS,
	max_frames=self.max_frames,
	)
	ckpt = torch.load(dit_path, map_location=torch.device(device), weights_only=True)
	self.dit.load_state_dict(ckpt, strict=False)
	self.dit = self.dit.to(device).eval()

	# ── Load VAE ──────────────────────────────────────────
	logger.info("Loading Oasis ViT-VAE from %s", vae_path)
	self.vae = VAE_models["vit-l-20-shallow-encoder"]()
	vae_ckpt = torch.load(vae_path, map_location=torch.device(device), weights_only=True)
	self.vae.load_state_dict(vae_ckpt)
	self.vae = self.vae.to(device).eval()

	# ── Precompute noise schedule ─────────────────────────
	betas = sigmoid_beta_schedule(self.max_noise_level).float().to(device)
	alphas = 1.0 - betas
	self.alphas_cumprod = torch.cumprod(alphas, dim=0)
	self.alphas_cumprod = rearrange(self.alphas_cumprod, "T -> T 1 1 1")

	# Noise range (matching generate.py)
	self.noise_range = torch.linspace(
	-1, self.max_noise_level - 1, self.ddim_steps + 1,
	)

	# ── State buffers ─────────────────────────────────────
	self._latent_buffer = None # (1, T, C, H, W)
	self._action_buffer = None # (1, T, num_action_keys)
	self._frame_idx = 0

	logger.info("Oasis 500M loaded on %s", device)

	# ── sai_wm interface ──────────────────────────────────────

	def reset(self, seed=None):
	if seed is not None:
	torch.manual_seed(seed)
	self.np_random = np.random.default_rng(seed)
	self._latent_buffer = None
	self._action_buffer = None
	self._frame_idx = 0

	def forward(self, obs: np.ndarray) -> dict:
	"""
	Encode initial frame(s).

	Parameters
	----------
	obs : np.ndarray
	RGB image, CHW or HWC, [0,1] or [0,255].
	"""
	img = self._to_tensor(obs) # (1, C, H, W)

	with torch.no_grad():
	with torch.autocast(self.device, dtype=torch.float16):
	z = self.vae.encode(img * 2 - 1).mean * self.scaling_factor

	# z: (1, seq_h*seq_w, latent_dim) → (1, C, H, W)
	ph = self.vae.seq_h
	pw = self.vae.seq_w
	z = rearrange(z, "b (h w) c -> b c h w", h=ph, w=pw)

	# Init buffers
	self._latent_buffer = z.unsqueeze(1) # (1, 1, C, H, W)
	# Initial "no-op" action
	self._action_buffer = torch.zeros(
	1, 1, NUM_ACTION_KEYS, device=self.device,
	)
	self._frame_idx = 1

	recon = self._decode(z)

	return {
	"latent_state": z.squeeze(0).cpu().numpy(),
	"recon": recon,
	}

	def predict(self, action) -> dict:
	"""
	Generate next frame. Sampling loop matches generate.py exactly.

	Parameters
	----------
	action : int or np.ndarray
	If int: index into the 25 action keys (sets that key to 1).
	If np.ndarray of shape (25,): raw one-hot/continuous action vector.
	"""
	if self._latent_buffer is None:
	raise RuntimeError("Call forward() first.")

	# ── Prepare action ────────────────────────────────────
	act = self._encode_action(action) # (1, 1, 25)
	self._action_buffer = torch.cat(
	[self._action_buffer, act], dim=1,
	)

	# ── Append noise chunk ────────────────────────────────
	B = 1
	chunk = torch.randn(
	(B, 1, *self._latent_buffer.shape[-3:]), device=self.device,
	)
	chunk = chunk.clamp(-self.noise_abs_max, self.noise_abs_max)
	x = torch.cat([self._latent_buffer, chunk], dim=1)

	i = self._frame_idx # current frame index (0-based)
	start_frame = max(0, i + 1 - self.max_frames)

	# ── Diffusion denoising loop (from generate.py) ───────
	for noise_idx in reversed(range(1, self.ddim_steps + 1)):
	# Noise levels: context frames get stabilization_level, last frame gets actual noise
	t_ctx = torch.full(
	(B, i), self.stabilization_level - 1,
	dtype=torch.long, device=self.device,
	)
	t = torch.full(
	(B, 1), int(self.noise_range[noise_idx].item()),
	dtype=torch.long, device=self.device,
	)
	t_next = torch.full(
	(B, 1), int(self.noise_range[noise_idx - 1].item()),
	dtype=torch.long, device=self.device,
	)
	t_next = torch.where(t_next < 0, t, t_next)

	t_full = torch.cat([t_ctx, t], dim=1)
	t_next_full = torch.cat([t_ctx, t_next], dim=1)

	# Sliding window
	x_curr = x.clone()[:, start_frame:]
	t_slice = t_full[:, start_frame:]
	t_next_slice = t_next_full[:, start_frame:]
	actions_slice = self._action_buffer[:, start_frame:i + 1]

	# DiT forward
	with torch.no_grad():
	with torch.autocast(self.device, dtype=torch.float16):
	v = self.dit(x_curr, t_slice, external_cond=actions_slice)

	# v-prediction → x_start, x_noise (matching generate.py)
	x_start = (
	self.alphas_cumprod[t_slice].sqrt() * x_curr
	- (1 - self.alphas_cumprod[t_slice]).sqrt() * v
	)
	x_noise = (
	(1 / self.alphas_cumprod[t_slice]).sqrt() * x_curr - x_start
	) / (1 / self.alphas_cumprod[t_slice] - 1).sqrt()

	# Frame prediction
	alpha_next = self.alphas_cumprod[t_next_slice]
	alpha_next[:, :-1] = torch.ones_like(alpha_next[:, :-1])
	if noise_idx == 1:
	alpha_next[:, -1:] = torch.ones_like(alpha_next[:, -1:])

	x_pred = alpha_next.sqrt() * x_start + x_noise * (1 - alpha_next).sqrt()
	x[:, -1:] = x_pred[:, -1:]

	# ── Update state ──────────────────────────────────────
	new_latent = x[:, -1:]
	self._latent_buffer = x # keep full buffer (includes new frame)

	# Trim to max context
	if self._latent_buffer.shape[1] > self.max_frames:
	trim = self._latent_buffer.shape[1] - self.max_frames
	self._latent_buffer = self._latent_buffer[:, trim:]
	self._action_buffer = self._action_buffer[:, trim:]

	self._frame_idx += 1

	recon = self._decode(new_latent.squeeze(1))

	return {
	"latent_state": new_latent.squeeze(0).squeeze(0).cpu().numpy(),
	"recon": recon,
	"reward": None,
	"terminated": False,
	}

	# ── Helpers ────────────────────────────────────────────────

	def _encode_action(self, action) -> torch.Tensor:
	"""Convert action to (1, 1, 25) tensor."""
	if isinstance(action, np.ndarray) and action.shape == (NUM_ACTION_KEYS,):
	return torch.from_numpy(action).float().reshape(1, 1, -1).to(self.device)
	elif isinstance(action, (int, np.integer)):
	act = torch.zeros(1, 1, NUM_ACTION_KEYS, device=self.device)
	act[0, 0, int(action)] = 1.0
	return act
	elif isinstance(action, torch.Tensor):
	return action.float().reshape(1, 1, -1).to(self.device)
	else:
	raise ValueError(
	f"Action must be int (action key index), np.ndarray(25,), "
	f"or torch.Tensor. Got {type(action)}."
	)

	def _to_tensor(self, obs: np.ndarray) -> torch.Tensor:
	"""Convert obs to (1, C, H, W) float [0,1]."""
	img = np.asarray(obs, dtype=np.float32)
	if img.ndim == 3 and img.shape[-1] in (1, 3, 4):
	img = np.transpose(img, (2, 0, 1))
	if img.max() > 1.0:
	img = img / 255.0
	return torch.from_numpy(img).unsqueeze(0).to(self.device)

	def _decode(self, z: torch.Tensor) -> np.ndarray:
	"""Decode latent (1, C, H, W) → RGB (C, H, W) in [0,1]."""
	with torch.no_grad():
	with torch.autocast(self.device, dtype=torch.float16):
	z_flat = rearrange(z, "b c h w -> b (h w) c")
	decoded = self.vae.decode(z_flat / self.scaling_factor)
	decoded = (decoded + 1) / 2
	return decoded.squeeze(0).clamp(0, 1).float().cpu().numpy()