Spaces:

nvidia
/

Cosmos3-Action-Viewer

Running

App Files Community

XinKongCosmos committed on about 4 hours ago

Commit

3264a6d

verified ·

1 Parent(s): 80ca707

Trim unused viewer support code

Browse files

Files changed (10) hide show

cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/__init__.py +0 -0
cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/augmentor.py +0 -52
cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py +86 -146
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py +0 -3
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py +2 -88
cosmos-framework/cosmos_framework/data/vfm/action/viewpoint_utils.py +1 -105
cosmos-framework/cosmos_framework/data/vfm/action_scripts/__init__.py +0 -0
cosmos-framework/cosmos_framework/data/vfm/action_scripts/memprofile.py +0 -254
cosmos-framework/cosmos_framework/data/vfm/augmentors/__init__.py +0 -0
cosmos-framework/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py +0 -10

cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/__init__.py DELETED Viewed

File without changes

cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/augmentor.py DELETED Viewed

@@ -1,52 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-from collections.abc import Iterable
-from typing import Any, Generator, Optional
-class Augmentor:
-    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
-        r"""Base augmentor class
-        Args:
-            input_keys (list): List of input keys
-            output_keys (list): List of output keys
-            args (dict): Arguments associated with the augmentation
-        """
-        self.input_keys = input_keys
-        self.output_keys = output_keys
-        self.args = args
-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-        raise ValueError("Augmentor not implemented")
-class IterableAugmentor:
-    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
-        r"""Base augmentor class
-        Args:
-            input_keys (list): List of input keys
-            output_keys (list): List of output keys
-            args (dict): Arguments associated with the augmentation
-        """
-        self.input_keys = input_keys
-        self.output_keys = output_keys
-        self.args = args
-        self.is_generator = True
-    def __call__(self, data: Iterable) -> Generator:
-        r"""Example usage:
-        for data_dict in data:
-            # Do something to data_dict
-            data_dict["input"] = data_dict["raw_sequence"][:, :-1]
-            data_dict["target"] = data_dict["raw_sequence"][:, 1:]
-            # Skip sample if needed
-            if data_dict["input"].shape[1] < 64:
-                continue
-            # Construct a generator
-            yield data_dict
-        """
-        raise ValueError("Augmentor not implemented")

cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py CHANGED Viewed

@@ -71,19 +71,6 @@ from cosmos_framework.data.vfm.action.action_spec import (  # noqa: F401  (re-ex
 from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
 from cosmos_framework.data.vfm.action.pose_utils import compute_idle_frames
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
-from cosmos_framework.data.vfm.action_scripts.memprofile import (
-    deep_size as _deep_size,
-)
-from cosmos_framework.data.vfm.action_scripts.memprofile import (
-    fmt_mb as _fmt_mb,
-)
-from cosmos_framework.data.vfm.action_scripts.memprofile import (
-    log_worker_memory_breakdown,
-    rss_tracker,
-)
-from cosmos_framework.data.vfm.action_scripts.memprofile import (
-    memprofile_enabled as _memprofile_enabled,
-)
 # ---------------------------------------------------------------------------
 # LRU-capped VideoDecoderCache
@@ -305,69 +292,47 @@ class BaseActionLeRobotDataset(Dataset):
         super().__init__()
         _ensure_hf_hub_offline()
         _patch_decoder_cache()
-        self._memprofile = _memprofile_enabled()
         assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
         assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
-        with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
-            self._fps = fps
-            self._dt = 1.0 / fps
-            self._chunk_length = chunk_length
-            self._split_seed = split_seed
-            self._split_val_ratio = split_val_ratio
-            self._split = _normalize_split(split)
-            self._mode = mode
-            self._embodiment_type = embodiment_type
-            self._viewpoint: Viewpoint = viewpoint
-            self._pose_convention = pose_convention
-            self._rotation_format = rotation_format
-            self._tolerance_s = tolerance_s
-            self._max_loaded_datasets = max_loaded_datasets
-            self._skip_video_loading = skip_video_loading
-            self._sample_stride = sample_stride
-            self._enable_fast_init = enable_fast_init
-            self._fast_init_max_workers = fast_init_max_workers
-            self._delta_timestamps: dict[str, list[float]] = {}
-            self._to_opencv: np.ndarray | dict[str, np.ndarray] = np.eye(3, dtype=np.float32)
-            if pose_convention is None:
-                log.warning(
-                    f"{self.__class__.__name__}: pose_convention is not set. "
-                    "Consider specifying 'backward_framewise' or 'backward_anchored'."
-                )
-            self._datasets: list[LeRobotDataset | None] = []
-            self._dataset_build_args: list[dict[str, Any] | None] = []
-            self._loaded_lru: OrderedDict[int, None] = OrderedDict()
-            # -- Flat index structures (populated by _append_index_records) --
-            # Together these two lists form a searchable map from a flat
-            # global index to (dataset, row, episode, frame).  One entry per
-            # episode span across *all* registered sources.
-            #
-            # _episode_records[i] = (ds_idx, sample_start, valid_len, episode_id)
-            #   ds_idx       – which source dataset (index into _datasets)
-            #   sample_start – first row of this span in that dataset's table
-            #   valid_len    – number of usable frames in this span
-            #   episode_id   – the episode this span belongs to
-            #
-            # _episode_cum_ends[i] = running total of valid_len through span i
-            #   Used for O(log N) lookup via bisect_right in _resolve_index.
-            self._episode_records: list[tuple[int, int, int, int]] = []
-            self._episode_cum_ends: list[int] = []
-            self._num_valid_indices = 0
-            self._domain_id = get_domain_id(self._embodiment_type)
-            # Deferred-init shard roots — a list of root paths.
-            # Subclasses populate this in __init__; _register_sources()
-            # reads _delta_timestamps and _tolerance_s from self (both
-            # initialised above, with _delta_timestamps overridden by
-            # each subclass).
-            # ActionUnifiedIterableDataset.assign_worker uses len() for
-            # round-robin shard distribution and _register_sources(indices)
-            # for deferred loading.  When empty, shard distribution is
-            # skipped (every worker iterates the full dataset).
-            self._all_shard_roots: list[str] = []
     # -- public properties ---------------------------------------------------
@@ -428,42 +393,30 @@ class BaseActionLeRobotDataset(Dataset):
         if repo_id == "local" and revision is None:
             revision = "local"
-        with rss_tracker(f"{cls}{label_str} — metadata load", enabled=self._memprofile):
-            if prefetched_meta is not None:
-                meta = prefetched_meta
-            else:
-                meta = LeRobotDatasetMetadata(
-                    repo_id=repo_id,
-                    root=root,
-                    revision=revision,
-                    force_cache_sync=force_cache_sync,
-                )
-            ds_idx = len(self._datasets)
-            self._datasets.append(None)
-            self._dataset_build_args.append(
-                {
-                    "repo_id": repo_id,
-                    "root": root,
-                    "delta_timestamps": delta_timestamps,
-                    "tolerance_s": tolerance_s,
-                    "force_cache_sync": force_cache_sync,
-                    "download_videos": download_videos,
-                    "video_backend": video_backend,
-                    "revision": revision,
-                }
             )
-        with rss_tracker(
-            f"{cls}{label_str} — index records",
-            enabled=self._memprofile,
-            extras_fn=lambda: [
-                f"episode_records so far: {len(self._episode_records)} entries, "
-                f"~{_fmt_mb(_deep_size(self._episode_records) / (1024 * 1024))}",
-                f"episode_cum_ends so far: {len(self._episode_cum_ends)} entries, "
-                f"~{_fmt_mb(_deep_size(self._episode_cum_ends) / (1024 * 1024))}",
-            ],
-        ):
-            self._append_index_records(meta=meta, ds_idx=ds_idx, dataset_label=dataset_label)
         return meta
@@ -584,35 +537,30 @@ class BaseActionLeRobotDataset(Dataset):
             evict_idx, _ = self._loaded_lru.popitem(last=False)
             self._datasets[evict_idx] = None
-        with rss_tracker(
-            f"[WORKER {_os.getpid()}] Lazy-loaded ds[{ds_idx}]",
-            enabled=self._memprofile,
-            extras_fn=lambda: [f"total loaded={len(self._loaded_lru)}/{len(self._datasets)}"],
-        ):
-            delta_ts = build_args["delta_timestamps"]
-            if self._skip_video_loading:
-                # Covers both LeRobot v2 (``observation.images.<name>``) and
-                # v3 (``observation.image.<name>``) video-column conventions.
-                delta_ts = {k: v for k, v in delta_ts.items() if not k.startswith("observation.image")}
-            log.info(f"Loading shard root={build_args['root']}")
-            ds = LeRobotDataset(
-                repo_id=build_args["repo_id"],
-                root=build_args["root"],
-                delta_timestamps=delta_ts,
-                tolerance_s=build_args["tolerance_s"],
-                force_cache_sync=build_args["force_cache_sync"],
-                download_videos=build_args["download_videos"],
-                video_backend=build_args["video_backend"],
-                revision=build_args["revision"],
-                episodes=None,
-            )
-            if self._skip_video_loading:
-                ds.meta.info["features"] = {
-                    k: v for k, v in ds.meta.info["features"].items() if v.get("dtype") != "video"
-                }
-            self._datasets[ds_idx] = ds
-            self._loaded_lru[ds_idx] = None
         return ds
@@ -688,15 +636,7 @@ class BaseActionLeRobotDataset(Dataset):
         mode = self._choose_mode()
         dataset_idx, row_idx, _, _ = self._resolve_index(idx)
-        self._getitem_count = getattr(self, "_getitem_count", 0) + 1
-        profile = self._memprofile and self._getitem_count % 50 == 1
-        with rss_tracker(
-            f"[WORKER {_os.getpid()}] __getitem__ transient (dataset_idx={dataset_idx})",
-            enabled=profile,
-            after_fn=lambda: log_worker_memory_breakdown(self),
-        ):
-            sample = self._get_dataset(dataset_idx)[row_idx]
         if self._skip_video_loading:
             sample = defaultdict(lambda: None, sample)

 from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
 from cosmos_framework.data.vfm.action.pose_utils import compute_idle_frames
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
 # ---------------------------------------------------------------------------
 # LRU-capped VideoDecoderCache
         super().__init__()
         _ensure_hf_hub_offline()
         _patch_decoder_cache()
         assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
         assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
+        self._fps = fps
+        self._dt = 1.0 / fps
+        self._chunk_length = chunk_length
+        self._split_seed = split_seed
+        self._split_val_ratio = split_val_ratio
+        self._split = _normalize_split(split)
+        self._mode = mode
+        self._embodiment_type = embodiment_type
+        self._viewpoint: Viewpoint = viewpoint
+        self._pose_convention = pose_convention
+        self._rotation_format = rotation_format
+        self._tolerance_s = tolerance_s
+        self._max_loaded_datasets = max_loaded_datasets
+        self._skip_video_loading = skip_video_loading
+        self._sample_stride = sample_stride
+        self._enable_fast_init = enable_fast_init
+        self._fast_init_max_workers = fast_init_max_workers
+        self._delta_timestamps: dict[str, list[float]] = {}
+        self._to_opencv: np.ndarray | dict[str, np.ndarray] = np.eye(3, dtype=np.float32)
+        if pose_convention is None:
+            log.warning(
+                f"{self.__class__.__name__}: pose_convention is not set. "
+                "Consider specifying 'backward_framewise' or 'backward_anchored'."
+            )
+        self._datasets: list[LeRobotDataset | None] = []
+        self._dataset_build_args: list[dict[str, Any] | None] = []
+        self._loaded_lru: OrderedDict[int, None] = OrderedDict()
+        # -- Flat index structures (populated by _append_index_records) --
+        # Together these two lists form a searchable map from a flat
+        # global index to (dataset, row, episode, frame). One entry per
+        # episode span across all registered sources.
+        self._episode_records: list[tuple[int, int, int, int]] = []
+        self._episode_cum_ends: list[int] = []
+        self._num_valid_indices = 0
+        self._domain_id = get_domain_id(self._embodiment_type)
+        self._all_shard_roots: list[str] = []
     # -- public properties ---------------------------------------------------
         if repo_id == "local" and revision is None:
             revision = "local"
+        if prefetched_meta is not None:
+            meta = prefetched_meta
+        else:
+            meta = LeRobotDatasetMetadata(
+                repo_id=repo_id,
+                root=root,
+                revision=revision,
+                force_cache_sync=force_cache_sync,
             )
+        ds_idx = len(self._datasets)
+        self._datasets.append(None)
+        self._dataset_build_args.append(
+            {
+                "repo_id": repo_id,
+                "root": root,
+                "delta_timestamps": delta_timestamps,
+                "tolerance_s": tolerance_s,
+                "force_cache_sync": force_cache_sync,
+                "download_videos": download_videos,
+                "video_backend": video_backend,
+                "revision": revision,
+            }
+        )
+        self._append_index_records(meta=meta, ds_idx=ds_idx, dataset_label=dataset_label)
         return meta
             evict_idx, _ = self._loaded_lru.popitem(last=False)
             self._datasets[evict_idx] = None
+        delta_ts = build_args["delta_timestamps"]
+        if self._skip_video_loading:
+            # Covers both LeRobot v2 (``observation.images.<name>``) and
+            # v3 (``observation.image.<name>``) video-column conventions.
+            delta_ts = {k: v for k, v in delta_ts.items() if not k.startswith("observation.image")}
+        log.info(f"Loading shard root={build_args['root']}")
+        ds = LeRobotDataset(
+            repo_id=build_args["repo_id"],
+            root=build_args["root"],
+            delta_timestamps=delta_ts,
+            tolerance_s=build_args["tolerance_s"],
+            force_cache_sync=build_args["force_cache_sync"],
+            download_videos=build_args["download_videos"],
+            video_backend=build_args["video_backend"],
+            revision=build_args["revision"],
+            episodes=None,
+        )
+        if self._skip_video_loading:
+            ds.meta.info["features"] = {
+                k: v for k, v in ds.meta.info["features"].items() if v.get("dtype") != "video"
+            }
+        self._datasets[ds_idx] = ds
+        self._loaded_lru[ds_idx] = None
         return ds
         mode = self._choose_mode()
         dataset_idx, row_idx, _, _ = self._resolve_index(idx)
+        sample = self._get_dataset(dataset_idx)[row_idx]
         if self._skip_video_loading:
             sample = defaultdict(lambda: None, sample)

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py CHANGED Viewed

@@ -16,9 +16,6 @@ from cosmos_framework.data.vfm.action.fractal import FractalLeRobotDataset
 from cosmos_framework.data.vfm.action.robomind_franka_dataset import RoboMINDFrankaDataset
 from cosmos_framework.data.vfm.action.umi_lerobot_dataset import UMIFastLeRobotDataset
-_DEFAULT_LUSTRE_DATASET_ROOT = "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets"
 @dataclass
 class LazyCall:
     """Tiny LazyCall replacement sufficient for the standalone viewer."""

 from cosmos_framework.data.vfm.action.robomind_franka_dataset import RoboMINDFrankaDataset
 from cosmos_framework.data.vfm.action.umi_lerobot_dataset import UMIFastLeRobotDataset
 @dataclass
 class LazyCall:
     """Tiny LazyCall replacement sufficient for the standalone viewer."""

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py CHANGED Viewed

@@ -21,12 +21,12 @@ from __future__ import annotations
 import argparse
 import importlib
 import os
 import random
 import sys
 import time as _time
 from dataclasses import dataclass, field
-from functools import lru_cache
 from pathlib import Path
 from typing import Any, cast
@@ -279,92 +279,6 @@ def _format_sample_text(value: Any, max_chars: int | None = None) -> str:
     return text[:max_chars]
-def _build_viewer_idle_action_spec(action_format: ActionFormat) -> Any:
-    """Build a fallback idle-frame spec from the viewer-declared action format."""
-    from cosmos_framework.data.vfm.action.action_spec import Gripper, Pos, Rot, build_action_spec
-    if action_format is ActionFormat.EGO_9D:
-        return build_action_spec(Pos(prefix="ego"), Rot("rot6d", prefix="ego"))
-    if action_format is ActionFormat.SINGLE_ARM_10D:
-        return build_action_spec(Pos(), Rot("rot6d"), Gripper())
-    if action_format is ActionFormat.DUAL_ARM_20D:
-        return build_action_spec(
-            Pos(prefix="left"),
-            Rot("rot6d", prefix="left"),
-            Gripper(prefix="left"),
-            Pos(prefix="right"),
-            Rot("rot6d", prefix="right"),
-            Gripper(prefix="right"),
-        )
-    raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
-def _compute_viewer_idle_frames(
-    action: Any,
-    dataset: Any,
-    action_format: ActionFormat,
-) -> torch.Tensor | None:
-    """Compute idle frames for a viewer sample when the dataset did not provide them."""
-    action_spec = getattr(dataset, "action_spec", None)
-    compute_idle_frames_method = getattr(dataset, "_compute_idle_frames", None)
-    if action_spec is not None and compute_idle_frames_method is not None:
-        return compute_idle_frames_method(action)
-    from cosmos_framework.data.vfm.action.pose_utils import compute_idle_frames
-    spec = _build_viewer_idle_action_spec(action_format)
-    try:
-        idle_frames = compute_idle_frames(action, spec)
-    except (TypeError, ValueError) as error:
-        log.warning(f"Viewer idle-frame detection skipped for {action_format.value}: {error}")
-        return None
-    return torch.tensor(idle_frames, dtype=torch.long)  # []
-@lru_cache(maxsize=1)
-def _get_viewer_idle_frames_augmentor() -> Any:
-    """Return the caption augmentor used by the viewer idle-frame path."""
-    from cosmos_framework.data.vfm.augmentors.idle_frames_text_info import IdleFramesTextInfo
-    return IdleFramesTextInfo(
-        input_keys=["ai_caption", "idle_frames", "action"],
-        output_keys=["ai_caption"],
-        args={
-            "caption_key": "ai_caption",
-            "idle_frames_key": "idle_frames",
-            "action_key": "action",
-            "dropout_rate": 0.0,
-            "enabled": True,
-        },
-    )
-def _enable_viewer_idle_frames(sample: dict[str, Any], dataset: Any, action_format: ActionFormat) -> dict[str, Any]:
-    """Populate idle-frame metadata and append text in the direct viewer data path."""
-    updated_sample = sample
-    idle_frames = updated_sample.get("idle_frames")
-    action = updated_sample.get("action")
-    if idle_frames is None and action is not None:
-        idle_frames = _compute_viewer_idle_frames(action, dataset, action_format)
-        if idle_frames is not None:
-            updated_sample = dict(updated_sample)
-            updated_sample["idle_frames"] = idle_frames
-    if idle_frames is None:
-        return updated_sample
-    updated_sample = dict(updated_sample)
-    caption = updated_sample.get("ai_caption")
-    if isinstance(caption, dict):
-        updated_sample["ai_caption"] = dict(caption)
-    augmented_sample = _get_viewer_idle_frames_augmentor()(updated_sample)
-    return updated_sample if augmented_sample is None else augmented_sample
 class _IterableToMapDataset:
     """Wraps an IterableDataset into a random-access dataset with lazy loading."""
@@ -687,7 +601,7 @@ def launch_viewer(
                             ep_idx = n_total - 1
                             ep_input.value = ep_idx
-                    sample: Any = _enable_viewer_idle_frames(dataset[ep_idx], dataset, effective_action_format)
                 action_tensor = sample["action"]
                 action_raw = (

 import argparse
 import importlib
+from functools import lru_cache
 import os
 import random
 import sys
 import time as _time
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, cast
     return text[:max_chars]
 class _IterableToMapDataset:
     """Wraps an IterableDataset into a random-access dataset with lazy loading."""
                             ep_idx = n_total - 1
                             ep_input.value = ep_idx
+                    sample: Any = dataset[ep_idx]
                 action_tensor = sample["action"]
                 action_raw = (

cosmos-framework/cosmos_framework/data/vfm/action/viewpoint_utils.py CHANGED Viewed

@@ -1,114 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-"""Viewpoint type definitions and caption augmentor for Action datasets.
-Provides a ``Viewpoint`` type alias for camera perspective labels and a
-``ViewpointTextInfo`` augmentor that appends a human-readable viewpoint
-description to the caption string.
-"""
 from __future__ import annotations
 from typing import Literal
-from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
-from cosmos_framework.utils import log
 Viewpoint = Literal["ego_view", "third_person_view", "wrist_view", "concat_view"]
-DEFAULT_VIEWPOINT_TEMPLATES: dict[str, str] = {
-    "ego_view": "This video is captured from a first-person perspective looking at the scene.",
-    "third_person_view": "This video is captured from a third-person perspective looking towards the agent from the front.",
-    "wrist_view": "This video is captured from a wrist-mounted camera.",
-    "concat_view": "This video contains concatenated views from multiple camera perspectives.",
-}
-class ViewpointTextInfo(Augmentor):
-    """Augmentor that appends viewpoint type description to captions.
-    Reads a viewpoint label from ``data_dict[viewpoint_key]`` and appends
-    the corresponding template sentence to the caption.  Designed to run
-    after the raw ``ai_caption`` is set but before duration/FPS metadata
-    is appended.
-    Args:
-        input_keys: Input keys (kept for API compatibility).
-        output_keys: Output keys (kept for API compatibility).
-        args: Configuration arguments:
-            - caption_key (str): Key for caption in data_dict. Default: ``"ai_caption"``
-            - viewpoint_key (str): Key for viewpoint label. Default: ``"viewpoint"``
-            - templates (dict): Override mapping from viewpoint to sentence.
-              Default: :data:`DEFAULT_VIEWPOINT_TEMPLATES`
-            - separator (str): Separator between caption and metadata. Default: ``". "``
-            - enabled (bool): Whether augmentation is enabled. Default: ``True``
-    """
-    def __init__(
-        self,
-        input_keys: list | None = None,
-        output_keys: list | None = None,
-        args: dict | None = None,
-    ) -> None:
-        super().__init__(input_keys or [], output_keys or [], args)
-        self.caption_key: str = args.get("caption_key", "ai_caption") if args else "ai_caption"
-        self.viewpoint_key: str = args.get("viewpoint_key", "viewpoint") if args else "viewpoint"
-        self.templates: dict[str, str] = (
-            args.get("templates", DEFAULT_VIEWPOINT_TEMPLATES) if args else DEFAULT_VIEWPOINT_TEMPLATES
-        )
-        self.default_separator: str = args.get("separator", ". ") if args else ". "
-        self.enabled: bool = args.get("enabled", True) if args else True
-    def __call__(self, data_dict: dict) -> dict | None:
-        """Append viewpoint description to the caption.
-        If the sample provides an ``"additional_view_description"`` key (a
-        free-form string describing the concatenated camera layout), it is
-        appended after the generic ``concat_view`` template. This allows each
-        dataset to supply its own description of which cameras are tiled and
-        how.
-        Args:
-            data_dict: Sample dictionary containing caption and viewpoint.
-        Returns:
-            The mutated *data_dict*, or the original unchanged if the
-            viewpoint key is missing or unrecognized.
-        """
-        if not self.enabled:
-            return data_dict
-        viewpoint = data_dict.get(self.viewpoint_key)
-        if viewpoint is None:
-            raise ValueError(
-                f"ViewpointTextInfo: missing key {self.viewpoint_key!r} in data_dict. "
-                f"All action datasets must provide a viewpoint label."
-            )
-        # Append dataset-specific concat_view details after the base template.
-        additional_view_description = data_dict.pop("additional_view_description", None)
-        template = self.templates.get(viewpoint)
-        if template is None:
-            log.warning(
-                f"ViewpointTextInfo: unrecognized viewpoint {viewpoint!r}. "
-                f"Known viewpoints: {sorted(self.templates.keys())}. Skipping.",
-                rank0_only=False,
-            )
-            return data_dict
-        if additional_view_description:
-            separator = " " if template.endswith(".") else self.default_separator
-            template = template + separator + additional_view_description.rstrip()
-        caption = data_dict.get(self.caption_key)
-        if not isinstance(caption, str) or caption == "":
-            return data_dict
-        caption = caption.rstrip()
-        separator = " " if caption.endswith(".") else self.default_separator
-        data_dict[self.caption_key] = caption + separator + template
-        return data_dict

 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+"""Viewpoint type definitions for release action datasets."""
 from __future__ import annotations
 from typing import Literal
 Viewpoint = Literal["ego_view", "third_person_view", "wrist_view", "concat_view"]

cosmos-framework/cosmos_framework/data/vfm/action_scripts/__init__.py DELETED Viewed

File without changes

cosmos-framework/cosmos_framework/data/vfm/action_scripts/memprofile.py DELETED Viewed

@@ -1,254 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-"""Lightweight CPU memory-profiling helpers.
-Only depends on ``os``, ``sys``, and ``psutil`` so it can be imported safely
-from dataset modules without pulling in heavy dependencies.
-Enable per-stage logging by setting the ``MEMORY_PROFILE`` env var::
-    MEMORY_PROFILE=1 torchrun ...
-"""
-import contextlib
-import gc
-import logging
-import os
-import sys
-from collections.abc import Callable, Iterator
-import psutil
-_log = logging.getLogger(__name__)
-def memprofile_enabled() -> bool:
-    """Return ``True`` when the ``MEMORY_PROFILE`` env var is truthy."""
-    return os.environ.get("MEMORY_PROFILE", "").strip() not in ("", "0", "false")
-def fmt_mb(mb: float) -> str:
-    """Format a MiB value as a human-readable string (MiB or GiB)."""
-    if mb >= 1024:
-        return f"{mb / 1024:.2f} GiB"
-    return f"{mb:.1f} MiB"
-@contextlib.contextmanager
-def rss_tracker(
-    label: str,
-    *,
-    enabled: bool | None = None,
-    extras_fn: Callable[[], list[str]] | None = None,
-    after_fn: Callable[[], None] | None = None,
-) -> Iterator[None]:
-    """Track RSS delta across a block.  No-op when profiling is disabled.
-    When *enabled* is ``False`` (or ``None`` and ``MEMORY_PROFILE`` is unset)
-    the context manager yields immediately with zero overhead -- no
-    ``gc.collect()`` and no ``psutil`` calls.
-    Args:
-        label: Human-readable description included in the log line.
-        enabled: Explicit toggle.  When ``None``, falls back to
-            ``memprofile_enabled()`` (i.e. the ``MEMORY_PROFILE`` env var).
-        extras_fn: Optional callback invoked *after* the measured block.
-            Each returned string is logged as a supplementary detail line.
-        after_fn: Optional side-effect callback invoked after logging.
-            Use for actions that should only run when profiling is active
-            (e.g. detailed worker memory breakdowns).
-    """
-    if enabled is None:
-        enabled = memprofile_enabled()
-    if not enabled:
-        yield
-        return
-    gc.collect()
-    rss_before = get_rss_mb()
-    yield
-    gc.collect()
-    rss_after = get_rss_mb()
-    _log.debug(
-        "[MEMPROFILE] %s | RSS: %s (delta: +%s)",
-        label,
-        fmt_mb(rss_after),
-        fmt_mb(rss_after - rss_before),
-    )
-    if extras_fn is not None:
-        for line in extras_fn():
-            _log.debug("[MEMPROFILE]   %s", line)
-    if after_fn is not None:
-        after_fn()
-def get_rss_mb() -> float:
-    """Return the current process RSS in MiB."""
-    return psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
-def get_process_tree_rss_mb() -> float:
-    """Return RSS of the current process + all children in MiB."""
-    proc = psutil.Process(os.getpid())
-    total = proc.memory_info().rss
-    for child in proc.children(recursive=True):
-        try:
-            total += child.memory_info().rss
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
-            pass
-    return total / (1024 * 1024)
-def get_worker_memory_breakdown() -> list[tuple[int, float]]:
-    """Return a list of ``(pid, rss_mib)`` for each child process."""
-    proc = psutil.Process(os.getpid())
-    result: list[tuple[int, float]] = []
-    for child in proc.children(recursive=True):
-        try:
-            rss_mb = child.memory_info().rss / (1024 * 1024)
-            result.append((child.pid, rss_mb))
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
-            pass
-    return result
-def get_worker_memory_detailed() -> list[dict[str, float]]:
-    """Return RSS, USS (Unique Set Size), and PSS for each child process.
-    USS is the memory *unique* to a process -- not shared with any other.
-    It directly measures CoW-duplicated pages plus worker-only allocations.
-    PSS counts shared pages proportionally (shared_page / num_sharers).
-    Returns list of dicts with keys: ``pid``, ``rss``, ``uss``, ``pss`` (all in MiB).
-    Falls back to RSS-only if ``memory_full_info()`` is unavailable.
-    """
-    proc = psutil.Process(os.getpid())
-    result: list[dict[str, float]] = []
-    for child in proc.children(recursive=True):
-        try:
-            full = child.memory_full_info()
-            result.append(
-                {
-                    "pid": float(child.pid),
-                    "rss": full.rss / (1024 * 1024),
-                    "uss": full.uss / (1024 * 1024),
-                    "pss": full.pss / (1024 * 1024),
-                }
-            )
-        except (psutil.NoSuchProcess, psutil.AccessDenied, AttributeError):
-            try:
-                rss_mb = child.memory_info().rss / (1024 * 1024)
-                result.append(
-                    {
-                        "pid": float(child.pid),
-                        "rss": rss_mb,
-                        "uss": -1.0,
-                        "pss": -1.0,
-                    }
-                )
-            except (psutil.NoSuchProcess, psutil.AccessDenied):
-                pass
-    return result
-def get_uss_mb() -> float:
-    """Return USS (Unique Set Size) of the current process in MiB.
-    Falls back to RSS if ``memory_full_info()`` is unavailable.
-    """
-    proc = psutil.Process(os.getpid())
-    try:
-        return proc.memory_full_info().uss / (1024 * 1024)
-    except (AttributeError, psutil.AccessDenied):
-        return proc.memory_info().rss / (1024 * 1024)
-def log_worker_memory_breakdown(dataset: object) -> None:
-    """Log a detailed memory breakdown from inside a dataloader worker.
-    Designed to be called periodically from ``__getitem__`` when
-    ``MEMORY_PROFILE=1``.  Inspects the dataset's internal state to
-    report how many ``LeRobotDataset`` instances are loaded, HuggingFace
-    Arrow table sizes, and the LeRobot ``VideoDecoderCache`` size.
-    Args:
-        dataset: A ``BaseActionLeRobotDataset`` instance (or compatible).
-    """
-    import gc
-    import logging
-    pid = os.getpid()
-    rss = get_rss_mb()
-    uss = get_uss_mb()
-    logger = logging.getLogger(f"memprofile.worker.{pid}")
-    logger.warning(f"[WORKER {pid}] RSS={fmt_mb(rss)} USS={fmt_mb(uss)}")
-    # --- LeRobotDataset instances ---
-    datasets_list = getattr(dataset, "_datasets", [])
-    loaded_count = sum(1 for ds in datasets_list if ds is not None)
-    total_count = len(datasets_list)
-    logger.warning(f"[WORKER {pid}]   LeRobotDataset: {loaded_count}/{total_count} loaded")
-    total_arrow_bytes = 0
-    total_hf_rows = 0
-    for i, ds in enumerate(datasets_list):
-        if ds is None:
-            continue
-        hf_ds = getattr(ds, "hf_dataset", None)
-        if hf_ds is None:
-            logger.warning(f"[WORKER {pid}]     ds[{i}]: hf_dataset not yet loaded")
-            continue
-        num_rows = len(hf_ds)
-        total_hf_rows += num_rows
-        arrow_bytes = 0
-        data_table = getattr(hf_ds, "_data", None)
-        if data_table is not None and hasattr(data_table, "nbytes"):
-            arrow_bytes = data_table.nbytes
-            total_arrow_bytes += arrow_bytes
-        logger.warning(f"[WORKER {pid}]     ds[{i}]: rows={num_rows}, arrow={fmt_mb(arrow_bytes / (1024 * 1024))}")
-    if loaded_count > 0:
-        logger.warning(
-            f"[WORKER {pid}]   Total HF rows={total_hf_rows}, total arrow={fmt_mb(total_arrow_bytes / (1024 * 1024))}"
-        )
-    # --- VideoDecoderCache ---
-    try:
-        from lerobot.datasets.video_utils import _default_decoder_cache
-        cache_size = _default_decoder_cache.size()
-        logger.warning(f"[WORKER {pid}]   VideoDecoderCache entries: {cache_size}")
-    except Exception:
-        pass
-    # --- GC stats ---
-    gc_counts = gc.get_count()
-    all_objects = len(gc.get_objects())
-    logger.warning(f"[WORKER {pid}]   GC counts={gc_counts}, tracked objects={all_objects}")
-def deep_size(obj: object, seen: set | None = None) -> int:
-    """Approximate deep memory size in bytes for nested Python containers.
-    Recursively walks ``dict``, ``list``, ``tuple``, ``set``, and ``frozenset``.
-    Does **not** follow arbitrary object attributes.
-    """
-    if seen is None:
-        seen = set()
-    obj_id = id(obj)
-    if obj_id in seen:
-        return 0
-    seen.add(obj_id)
-    size = sys.getsizeof(obj)
-    if isinstance(obj, dict):
-        for k, v in obj.items():
-            size += deep_size(k, seen) + deep_size(v, seen)
-    elif isinstance(obj, (list, tuple, set, frozenset)):
-        for item in obj:
-            size += deep_size(item, seen)
-    return size

cosmos-framework/cosmos_framework/data/vfm/augmentors/__init__.py DELETED Viewed

File without changes

cosmos-framework/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py DELETED Viewed

@@ -1,10 +0,0 @@
-class IdleFramesTextInfo:
-    """Minimal standalone replacement for viewer caption augmentation."""
-    def __init__(self, input_keys=None, output_keys=None, args=None):
-        self.input_keys = input_keys or []
-        self.output_keys = output_keys or []
-        self.args = args or {}
-    def __call__(self, sample, *args, **kwargs):
-        return sample