Lexa committed on
Commit
b5a0bec
·
1 Parent(s): bb10ea5

Converted .pt files to safetensors, then (dirtily) patched fairseq to enable loading of safetensor files

Browse files
.gitattributes CHANGED
@@ -1 +1,2 @@
1
  *.pt filter=lfs diff=lfs merge=lfs -text
 
 
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -115,4 +115,7 @@ mortimer_env.txt
115
  _LexaLCM_Block0/Datasets/
116
 
117
  # UV
118
- uv.lock
 
 
 
 
115
  _LexaLCM_Block0/Datasets/
116
 
117
  # UV
118
+ uv.lock
119
+
120
+ # Unsafe files
121
+ *.pt
Patches/Patch_TorchLoader.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Patch for fairseq2.utils.file.load_tensors
2
+ #
3
+ # This patch allows for loading safetensors files
4
+ #
5
+ # It is used in the two_tower_diffusion_lcm model loader:
6
+ # ./lcm/models/two_tower_diffusion_lcm/loader.py
7
+
8
+ from __future__ import annotations
9
+
10
+ import warnings
11
+ from pathlib import Path
12
+ from typing import Any, Callable, Dict, Mapping, Optional, Protocol, Union
13
+ from warnings import catch_warnings
14
+
15
+ import torch
16
+ from torch import Tensor
17
+ from typing_extensions import TypeAlias
18
+
19
+ from fairseq2.typing import Device
20
+
21
+ from safetensors.torch import load_file
22
+
23
+ MapLocation: TypeAlias = Optional[
24
+ Union[Callable[[Tensor, str], Tensor], Device, str, Dict[str, str]]
25
+ ]
26
+
27
+
28
+ class TensorLoader(Protocol):
29
+ """Loads tensors from files."""
30
+
31
+ def __call__(
32
+ self,
33
+ path: Path,
34
+ *,
35
+ map_location: MapLocation = None,
36
+ restrict: bool = False,
37
+ ) -> Dict[str, Any]:
38
+ """
39
+ :param path:
40
+ The path to the file.
41
+ :param map_location:
42
+ Same as the ``map_location`` parameter of :func:`torch.load`.
43
+ :param restrict:
44
+ If ``True``, restricts the Python unpickler to load only tensors,
45
+ primitive types, and dictionaries.
48
+ """
49
+
50
+
51
+ class TensorDumper(Protocol):
52
+ """Dumps tensors to files."""
53
+
54
+ def __call__(self, data: Mapping[str, Any], path: Path) -> None:
55
+ """
56
+ :param data:
57
+ The dictionary containing tensors and other auxiliary data.
58
+ :param path:
59
+ The path to the file.
60
+ """
61
+
62
+
63
+ def load_tensors(
64
+ path: Path,
65
+ *,
66
+ map_location=None,
67
+ restrict: bool = False,
68
+ ) -> Dict[str, Any]:
69
+ """Load a checkpoint in .pt or .safetensors format."""
70
+ if str(path).endswith(".safetensors"):
71
+ tensors = load_file(str(path), device=str(map_location) if map_location else "cpu")
72
+ return {"model": tensors} # ✅ Wrap it like a .pt file
73
+
74
+
75
+ with warnings.catch_warnings():
76
+ warnings.simplefilter("ignore")
77
+ return torch.load(
78
+ str(path), map_location, weights_only=restrict # type: ignore[arg-type]
79
+ )
80
+
81
+
82
+ def dump_tensors(data: Mapping[str, Any], path: Path) -> None:
83
+ """Dump ``data`` to a PyTorch tensor file under ``path``."""
84
+ with catch_warnings():
85
+ warnings.simplefilter("ignore") # Suppress noisy FSDP warnings.
86
+
87
+ torch.save(data, path)
_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000/{metadata.pt → metadata.safetensors} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72a183d6a5d90ff8ae2bd4ceaab9cc107d20c53f4e4d37f1152fbc27b356a5b4
3
- size 5284
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bbcbf73561f6bc5d0a17ea6a2081feed2d1304e87602d8c502d9a5c4bd85576
3
+ size 16
_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000/{model.pt → model.safetensors} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c587394ef0a4ab818d9e023974d351d70852a2f02847efdbd13ef327a4c6ac33
3
- size 575893434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f6160840e8a76276b126f4da6ded5568c2dcc777fd40007ccfa5bcfb08d9bce
3
+ size 575804960
_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000/model_card.yaml CHANGED
@@ -1,5 +1,5 @@
1
  __source__: inproc
2
- checkpoint: file:///home/lexa/DevProjects/_Unsorted/LexaLCM_Pre0_288M/_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000/model.pt
3
  model_arch: arch_lexa_lcm_pre0
4
  model_family: two_tower_diffusion_lcm
5
  name: on_the_fly_lcm
 
1
  __source__: inproc
2
+ checkpoint: file:///home/lexa/DevProjects/_Unsorted/LexaLCM_Pre0_288M/_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000/model.safetensors
3
  model_arch: arch_lexa_lcm_pre0
4
  model_family: two_tower_diffusion_lcm
5
  name: on_the_fly_lcm
_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000/{rank_0.pt → rank_0.safetensors} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:766a467589456f9d9e060dc79d5837c8e7f0f9dd8572997cae32c97d66eb74cb
3
- size 2307681830
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bbcbf73561f6bc5d0a17ea6a2081feed2d1304e87602d8c502d9a5c4bd85576
3
+ size 16
lcm/datasets/base.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ import logging
7
+ from abc import ABC, abstractmethod
8
+ from typing import Callable, Dict, Generic, Iterator, Optional, Sequence, TypeVar, Union
9
+
10
+ import torch
11
+ from fairseq2.data.data_pipeline import DataPipeline
12
+ from fairseq2.gang import FakeGang, Gang
13
+ from fairseq2.typing import DataType
14
+
15
+ from lcm.datasets.configs import (
16
+ DataLoadingConfig,
17
+ DatasetConfigT,
18
+ create_dataset_config_from_cards,
19
+ )
20
+ from lcm.datasets.dataloading import (
21
+ build_weighted_pipeline_with_renaming as default_build_fn,
22
+ )
23
+ from lcm.utils.common import Batched, set_mkl_num_threads
24
+
25
+ BatchT_co = TypeVar("BatchT_co", bound=Union[Dict, Batched], covariant=True)
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class DataLoader(ABC, Generic[BatchT_co, DatasetConfigT]):
30
+ def __init__(
31
+ self,
32
+ data_config: DataLoadingConfig,
33
+ datasets: Sequence[DatasetConfigT],
34
+ gang: Gang,
35
+ builder_func: Callable[..., DataPipeline] = default_build_fn,
36
+ dtype: DataType = torch.float16,
37
+ ):
38
+ self.data_config = data_config
39
+ self.datasets = list(map(create_dataset_config_from_cards, datasets))
40
+ self.dtype = dtype
41
+ self.gang = gang
42
+ self.builder_func = builder_func
43
+
44
+ self._pipeline: Optional[DataPipeline] = None
45
+
46
+ @property
47
+ def pipeline(self) -> DataPipeline:
48
+ if self._pipeline is None:
49
+ logger.info(f"R{self.gang.rank} self._pipeline is None, building...")
50
+ gang_rank = self.gang.rank if self.gang else 0
51
+ world_size = self.gang.size if self.gang else 1
52
+
53
+ self._pipeline = self.builder_func(
54
+ self.datasets, self.data_config, gang_rank, world_size
55
+ )
56
+ assert self._pipeline, (
57
+ f"Cannot build data pipeline from config {self.data_config}"
58
+ )
59
+ return self._pipeline
60
+
61
+ def destroy(self) -> None:
62
+ """Destroy the pipeline to rebuild it with different shuffling"""
63
+ self._pipeline = None
64
+ # Build again and reset it
65
+ logger.info(f"R{self.gang.rank} resetting the pipeline in DataLoader.destroy")
66
+ self.reset()
67
+
68
+ def reset(self) -> None:
69
+ """
70
+ Applying reset will result in different shuffling for next iterations,
71
+ since pipeline will use modified generator state from previous one.
72
+ This is a suitable side effect for the `sharding_in_memory=False` (training) scenario.
73
+
74
+ Illustrative example :
75
+ >>> import torch
76
+ >>> from fairseq2.data import read_sequence
77
+
78
+ >>> def get_one_epoch_pipeline():
79
+ ... torch.manual_seed(13)
80
+ ... return read_sequence(list(range(10))).shuffle(5)
81
+
82
+ >>> bb = get_one_epoch_pipeline().and_return()
83
+ >>> list(bb)
84
+ [3, 1, 2, 4, 0, 8, 5, 6, 9, 7]
85
+ >>> bb.reset()
86
+ >>> list(bb)
87
+ [4, 0, 3, 2, 1, 9, 7, 6, 8, 5]
88
+ """
89
+ self.pipeline.reset()
90
+
91
+ @abstractmethod
92
+ def iterate_batches(self) -> Iterator[BatchT_co]: ...
93
+
94
+
95
+ class BaseDataLoader(DataLoader[dict, DatasetConfigT]):
96
+ def __init__(
97
+ self,
98
+ data_config: DataLoadingConfig,
99
+ datasets: Sequence[DatasetConfigT],
100
+ dtype: DataType = torch.float16,
101
+ gang: Gang = None,
102
+ ) -> None:
103
+ gang = gang or FakeGang()
104
+ super().__init__(
105
+ data_config=data_config,
106
+ datasets=datasets,
107
+ builder_func=default_build_fn,
108
+ dtype=dtype,
109
+ gang=gang,
110
+ )
111
+ set_mkl_num_threads()
112
+
113
+ def iterate_batches(self) -> Iterator[dict]:
114
+ yield from iter(self.pipeline)
lcm/datasets/configs.py ADDED
@@ -0,0 +1,774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ import logging
7
+ import re
8
+ from dataclasses import asdict, dataclass, fields
9
+ from enum import Enum
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, TypeVar
12
+
13
+ # XXX: these should be kept for eval of filters expressions
14
+ import pyarrow as pa
15
+ import pyarrow.compute as pc
16
+ import pyarrow.parquet as pq
17
+ from fairseq2.assets import default_asset_store
18
+ from omegaconf import MISSING
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class ParquetBatchFormat(Enum):
24
+ pyarrow = 0
25
+ pandas = 1
26
+ torch = 2
27
+
28
+
29
+ class ColumnsNames(Enum):
30
+ source_column = "_source_column"
31
+ source_text_column = "_source_text_column"
32
+ target_column = "_target_column"
33
+ target_text_column = "_target_text_column"
34
+
35
+ dataset_name = "_dataset_name"
36
+
37
+
38
+ @dataclass
39
+ class SonarTextColumn:
40
+ text_value: Optional[str] = None
41
+ """
42
+ Raw text expression that will be used as a constant column after being split into sentences and SONAR-encoded.
43
+ """
44
+ text_column: Optional[str] = None
45
+ sonar_column: Optional[str] = None
46
+ """
47
+ Note `text_column` and `sonar_column` should be aligned (so `sonar_column` should be sonar encoded `text_column`).
48
+ If `sonar_column` is None and `text_column` is provided, we set `sonar_column = f"{text_column}_sonar_emb"` as default processing value!
49
+ """
50
+
51
+
52
+ @dataclass
53
+ class ParquetDatasetLimitOptions:
54
+ fraction_of_files: Optional[float] = None
55
+ nb_files: Optional[int] = None
56
+ nb_fragments: Optional[int] = None
57
+ nb_rows: Optional[int] = None
58
+
59
+
60
+ @dataclass(frozen=True)
61
+ class SonarDecoderConfig:
62
+ tokenizer: str = "text_sonar_basic_decoder"
63
+ """ SONAR tokenizer """
64
+
65
+ decoder: str = "text_sonar_basic_decoder"
66
+ """ SONAR decoder"""
67
+
68
+ lang: str = "eng_Latn"
69
+ """ Target language """
70
+
71
+ max_tokens_in_sentence: int = 256
72
+ """Maximum number of tokens generated in the text"""
73
+
74
+ temperature: float = 1.0
75
+ """The decoding logit temperature, where values greater than 1.0 produce more
76
+ uniform logits; values less than 1.0 produce sharper logits."""
77
+
78
+
79
+ @dataclass(frozen=True)
80
+ class SonarEncoderConfig:
81
+ tokenizer: str = "text_sonar_basic_encoder"
82
+ """ SONAR tokenizer """
83
+
84
+ encoder: str = "text_sonar_basic_encoder"
85
+ """ SONAR decoder"""
86
+
87
+ lang: str = "eng_Latn"
88
+ """ Target language """
89
+
90
+
91
+ @dataclass
92
+ class DatasetConfig:
93
+ """
94
+ Generic dataset config
95
+ """
96
+
97
+ columns: Optional[List[str]] = None
98
+ """The list of columns to load.
99
+ Columns such as `source_column`, ..., will be added automatically.
100
+ """
101
+
102
+ source_text_column: Optional[str] = None
103
+ """ Column to load as source raw text"""
104
+
105
+ target_text_column: Optional[str] = None
106
+ """ Column to load as target raw text for paired data"""
107
+
108
+ source_prefix_text: Optional[str] = None
109
+ """ Text to prepend to the content of the source_column"""
110
+
111
+ source_suffix_text: Optional[str] = None
112
+ """ Text to append to the content of the target_column"""
113
+
114
+ target_prefix_text: Optional[str] = None
115
+ """ Text to prepend to the content of the source_column"""
116
+
117
+ target_suffix_text: Optional[str] = None
118
+ """ Text to append to the content of the target_column"""
119
+
120
+ source_sequences: Optional[List[SonarTextColumn]] = None
121
+ """
122
+ Designed to make on-the-fly prompts from existing columns that are more complex than prefix and suffix.
123
+ Each element of source_sequences is a SonarTextColumn, which can be either:
124
+ - constant raw text (with the text_value argument)
125
+ - text column (with the text_column argument)
126
+ - sonar column (with the sonar_column argument)
127
+
128
+ Note that text_value cannot co-exist with text_column or sonar_column, and sonar column cannot be specified
129
+ without a text column. Further behaviour for parquet datasets:
130
+ - If text_value is specified, this will be split to sentences and sonarized
131
+ - If only text_column is specified, a new column named "<text_column>_sonar_emb" will be added as sonar_column.
132
+ - If both (text_column, sonar_column) is specified,
133
+
134
+ All SonarTextColumn elements from source_sequences will be concatenated together to produce new source_column
135
+ and source_text_column (same for target), which will have names as defined in ColumnsNames.
136
+ Using source_sequences is NOT compatible with using source_column or source_text_column, as well as quality filtering.
137
+ """
138
+
139
+ target_sequences: Optional[List[SonarTextColumn]] = None
140
+ """Designed to make on-the-fly prompts / instructions for target column, see `source_sequences` for more details"""
141
+
142
+ silent_freeze: bool = False
143
+ """If set to true, the config value can only be set once, i.e. it will not be able to update after the being set is instantiated.
144
+ This is helpful to avoid side-effect in setting some configs after being specified by the user application (Hydra, CLI)"""
145
+
146
+ def __post_init__(self):
147
+ if self.source_sequences is not None:
148
+ if self.source_text_column is not None:
149
+ logger.warning(
150
+ f"Both `source_sequence` and `source_text_column` is specified. "
151
+ f"Ignore `source_text_column` and use default value `{ColumnsNames.source_text_column.value}`.\n"
152
+ f"(`source_sequences` = {self.source_sequences}, \n"
153
+ f"`source_text_column` = {self.source_text_column} )"
154
+ )
155
+ self.source_text_column = ColumnsNames.source_text_column.value
156
+
157
+ if self.target_sequences is not None:
158
+ if self.target_text_column is not None:
159
+ logger.warning(
160
+ f"Both `target_sequences` and `target_text_column` is specified. "
161
+ f"Ignore `target_text_column` and use default value `{ColumnsNames.target_text_column.value}`.\n"
162
+ f"(`target_sequences` = {self.target_sequences}, \n"
163
+ f"`target_text_column` = {self.target_text_column} )"
164
+ )
165
+ self.target_text_column = ColumnsNames.target_text_column.value
166
+
167
+ for col in (self.source_sequences or []) + (self.target_sequences or []):
168
+ if col.text_value is not None:
169
+ assert col.text_column is None and col.sonar_column is None
170
+ else:
171
+ assert col.text_column is not None
172
+
173
+ self._has_initialized_: bool = True
174
+
175
+ def __setattr__(self, name: str, value: Any) -> None:
176
+ if not getattr(self, "_has_initialized_", False):
177
+ return super().__setattr__(name, value)
178
+ if name == "silent_freeze":
179
+ raise ValueError(
180
+ "Direct change of silent_freeze outside __init__ is forbidden"
181
+ )
182
+ if self.silent_freeze and getattr(self, name) not in ("", None, MISSING):
183
+ logger.debug(
184
+ f"Ignore change of {name} since silent_freeze is set and value is not empty ({getattr(self, name)})"
185
+ )
186
+ return
187
+ super().__setattr__(name, value)
188
+
189
+ def override_attr(self, name: str, value: Any) -> None:
190
+ try:
191
+ self._has_initialized_ = False
192
+ super().__setattr__(name, value)
193
+ finally:
194
+ self._has_initialized_ = True
195
+
196
+ def freeze(self) -> None:
197
+ """Turn the `silent_freeze` flag on"""
198
+ try:
199
+ self._has_initialized_ = False
200
+ self.silent_freeze = True
201
+ finally:
202
+ self._has_initialized_ = True
203
+
204
+
205
+ @dataclass
206
+ class JSONDatasetConfig(DatasetConfig):
207
+ """Config for datasets stored in JsonL format."""
208
+
209
+ file_path: str = str()
210
+ """
211
+ Path to the directory containing the Jsonl dataset.
212
+ Each task will replace this with a real JSON file
213
+ TODO: Add support for remote JsonL file (e.g. with "s3://...")
214
+ """
215
+
216
+ prompt_template: Optional[str] = None
217
+ """
218
+ A jinja-format string to apply for each item in the dataset to transform into a string.
219
+ Useful for example when compiling a dynamic instruction / prompt for training or evaluation.
220
+ Note that when this is specified, it will take precedence over the "affix" option, i.e. the
221
+ columns `source_prefix_text`, `source_suffix_text`,... will be ignored.
222
+ """
223
+
224
+ def __setattr__(self, name: str, value: Any) -> None:
225
+ if not getattr(self, "_has_initialized_", False):
226
+ return super().__setattr__(name, value)
227
+
228
+ if name == "silent_freeze":
229
+ raise ValueError("Direct change of silent_freeze is forbidden")
230
+
231
+ if self.silent_freeze:
232
+ if getattr(self, name) not in ("", None, MISSING):
233
+ logger.debug(
234
+ f"Ignore change of {name} in silent frozen mode when value is not empty ({getattr(self, name)})"
235
+ )
236
+ return
237
+
238
+ # Ensure we cannot set the default `prompt_template` value when the user specifies
239
+ # source_sequences or source_text_column explicitly
240
+ for hi_prior_col, lo_prior_col, lo_prior_value in [
241
+ ("source_sequences", "source_text_column", self.source_text_column),
242
+ ("target_sequences", "target_text_column", self.target_text_column),
243
+ ("prompt_template", "source_sequences", self.source_sequences),
244
+ ("prompt_template", "source_prefix_text", self.source_prefix_text),
245
+ ("prompt_template", "source_suffix_text", self.source_suffix_text),
246
+ ]:
247
+ if name == hi_prior_col and lo_prior_value not in ("", None, MISSING):
248
+ logger.warning(
249
+ f"Updating value of {hi_prior_col} will cause conflicts with the user-defined "
250
+ f"value in {lo_prior_col}. The update will be ignored.\n"
251
+ )
252
+ return
253
+
254
+ super().__setattr__(name, value)
255
+
256
+
257
+ @dataclass
258
+ class ParquetDatasetConfig(DatasetConfig):
259
+ """
260
+ Config for datasets stored in Parquet format.
261
+
262
+ XXX: this config should not hold non-trival default values.
263
+ We want this to make datacards info and hydra config merge easier.
264
+ All None value should be filled up in downstream `build_parquet_iterator_pipeline`.
265
+ """
266
+
267
+ name: Optional[str] = None
268
+ """When name is provided, it will use preregistered cards to populate all attributes.
269
+ name convention is the following
270
+ - {card_name}={split}:{weight}
271
+
272
+ Example:
273
+ - wiki
274
+ - wiki:0.2 # no split
275
+ - wiki=dev # default weight=1
276
+ - wiki=dev:0.2
277
+
278
+ Cards attributes will be overwritten by user defined ParquetDatasetConfig in
279
+ `create_dataset_config_from_cards`.
280
+ """
281
+
282
+ parquet_path: str = str()
283
+ """The path to parquet dataset file.
284
+ if `parquet_path` is remote (like stats with "s3://..."),
285
+ the filesystem will be automatically detected and `filesystem_expr` should remain None
286
+ """
287
+
288
+ weight: float = 1.0
289
+ """
290
+ Indicates relative weight of dataset that can be used for sampling from different datasets.
291
+ """
292
+
293
+ limit: Optional[ParquetDatasetLimitOptions] = None
294
+ """
295
+ Contains different options that allows to load only a part of the provided dataset.
296
+ It will **always** take some number of **first** fragments according to the order in which
297
+ they appear in the dataset, and this logic will not be dependent on shuffling/seed.
298
+ When several limits are provided, each of them will be applied (resulting in the strongest limit).
299
+ """
300
+
301
+ source_column: Optional[str] = None
302
+ """ Column to load as source embeddings"""
303
+
304
+ target_column: Optional[str] = None
305
+ """ Column to load as target embeddings for paired data"""
306
+
307
+ source_quality_column: Optional[str] = None
308
+ source_quality_range: Optional[Any] = None
309
+
310
+ partition_filters: Optional[str] = None
311
+ """
312
+ Filters that should be applied only on partition columns for fast partition pruning.
313
+ This filters should not be duplicated in `filters` (below) which are used on materialized data.
314
+ To know the partition columns on dataset :
315
+ ```python
316
+ >>> pq.ParquetDataset(parquet_path).partitioning.schema.names
317
+ ```
318
+ Note that for if `parquet_path` references a single file -> the result above will NOT be correct (returns all columns).
319
+ Note that for a single-file case, there should be no partition_filters since there are no partitions !!
320
+ """
321
+
322
+ filters: Optional[str] = None
323
+ """See https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression
324
+
325
+ Some examples :
326
+
327
+ >>> import pyarrow.compute as pc
328
+ >>> import pyarrow as pa
329
+
330
+ >>> filters = (pc.field("data_split") == pc.scalar("train")) & (pc.field("duration") > 7)
331
+ >>> filters = pa.compute.greater(pa.compute.utf8_length(ds.field("lang1_text")), 4)
332
+ >>> filters = pa.compute.less_equal(pa.compute.list_value_length(pa.dataset.field("audio_wav")), 16_000 * 30)
333
+
334
+ Note that all fields used here should be among existing columns in the dataset schema.
335
+ For hydra compatibility, we need to pass this filters as an str expression that'll be passed to `eval(...)`
336
+ """
337
+
338
+ filesystem_expr: Optional[str] = None
339
+ """
340
+ DEPRECATED: not used any more and will be removed soon
341
+ """
342
+
343
+ filesystem: Optional[Any] = None
344
+ """
345
+ DEPRECATED: not used any more and will be removed soon
346
+ """
347
+
348
+ split_to_row_groups: Optional[bool] = None
349
+ """If ``True``, uses Parquet row groups instead of simple partitions which
350
+ are generally smaller. Highly recommended for non-partitioned parquet files."""
351
+
352
+ nb_parallel_fragments: Optional[int] = None
353
+ """
354
+ This parameter can be dataset specific:
355
+ For dataset with large number of sentences per document (sample),
356
+ it's enough to set `nb_parallel_fragments=2 or 3`.
357
+ For datasets, with smaller number of sentences (~10) and small row_group_size (~200-600),
358
+ `nb_parallel_fragments` could be increase to 10 - 20.
359
+
360
+ The number of Parquet fragments allowed to be read in parallel. Higher
361
+ values will result in higher speeds, better randomization, and higher memory
362
+ footprint. If partition size is rather small compared to the batch size, we
363
+ recommend to increase ``nb_parallel_fragments``.
364
+
365
+ Leaving ``nb_parallel_fragments`` to None will trigger auto-detection based on dataset metadata.
366
+ """
367
+
368
+ sharding_in_memory: bool = False
369
+ """
370
+ This option should be activated for sharding small datasets whose total number of row groups is small
371
+ that makes sharding per row group impossible.
372
+ """
373
+
374
+ def __post_init__(self):
375
+ super().__post_init__()
376
+
377
+ if self.source_sequences is not None:
378
+ if self.source_column is not None:
379
+ logger.warning(
380
+ f"Both `source_sequences` and `source_column` is specified. "
381
+ f"Ignore `source_column` and use default value `{ColumnsNames.source_column.value}`.\n"
382
+ f"(`source_sequences` = {self.source_sequences}, \n"
383
+ f"`source_column` = {self.source_column} )"
384
+ )
385
+ assert self.source_quality_range is None
386
+ self.source_column = ColumnsNames.source_column.value
387
+
388
+ if self.target_sequences is not None:
389
+ if self.target_column is not None:
390
+ logger.warning(
391
+ f"Both `target_sequences` and `target_column` is specified. "
392
+ f"Ignore `target_column` and use default value `{ColumnsNames.target_column.value}`.\n"
393
+ f"(`target_sequences` = {self.target_sequences}, \n"
394
+ f"`target_column` = {self.target_column} )"
395
+ )
396
+ self.target_column = ColumnsNames.target_column.value
397
+
398
+ for col in (self.source_sequences or []) + (self.target_sequences or []):
399
+ if col.sonar_column is None and col.text_value is None:
400
+ assert col.text_column, f"Invalid SonarTextColumn: {col}"
401
+ col.sonar_column = col.text_column + "_sonar_emb"
402
+
403
+ if self.source_quality_range is None:
404
+ self.source_quality_column = None
405
+
406
+
407
+ DatasetConfigT = TypeVar("DatasetConfigT", bound=DatasetConfig, contravariant=True)
408
+
409
+
410
+ @dataclass
411
+ class DataLoadingConfig:
412
+ multiple_dataset_chaining: str = "sample"
413
+ """
414
+ This option allows to chain several datasets together.
415
+ The chaining can be done in two ways:
416
+ - `sample` : each dataset will be sampled with the provided weight
417
+ - `concat` : datasets will be concatenated together (no weights taken into account)
418
+ - `round_robin`: datasets will be sampled in a round robin fashion (no weights taken into account)
419
+ """
420
+ batch_size: Optional[int] = None
421
+ """The output batch size."""
422
+
423
+ order_by_length: bool = True
424
+ """
425
+ Whether to create the batches with homogeneous tokens length
426
+ for more efficient padding.
427
+ """
428
+
429
+ max_tokens: Optional[int] = None
430
+ """Used with the ``order_by_length`` option to control the total number of
431
+ padded tokens in each batch. Typically, this option is preferred over
432
+ ``batch_size`` to reduce the memory footprint.
433
+ """
434
+
435
+ len_to_wrap_long_seq: Optional[int] = None
436
+ """
437
+ Wrapping a source sequences to the length of `len_to_wrap_long_seq`.
438
+ For instance, for a `len_to_wrap_long_seq=2`
439
+ batch = {
440
+ "source": [["v1", "v2", "v3", "v4", "v5"], ["u1", "u2", "u3"], ["w1"]],
441
+ }
442
+ will be transformed to
443
+ 1. if packing is False :
444
+ batch = {
445
+ "source": [['v1', 'v2'], ['v3', 'v4'], ['v5'], ["u1", "u2"], ["u3"], ["w1"]]
446
+ }
447
+ 1. if packing is True :
448
+ batch = {
449
+ "source": [['v1', 'v2'], ['v3', 'v4'], ['v5', 'u1'], ["u2", "u3"], ["w1"]]
450
+ }
451
+
452
+ Note: currently only allowed to be used with no "target" provided (unsupervised style) !
453
+ """
454
+
455
+ packing: bool = False
456
+ """
457
+ If True, all sequential documents (seqs of sentences) will be concated into one big document
458
+ before applying wrapping.
459
+ This will result in all samples (except maybe one) having exactly `len_to_wrap_long_seq` length !
460
+ """
461
+
462
+ wrap_before_affixing: bool = False
463
+ """
464
+ If True, we will wrap the sequences before adding the source prefix/suffix.
465
+ Recommended when pre-training with packed data i.e len_to_wrap_long_seq not None and packing=True
466
+ """
467
+
468
+ max_sentence_len_in_doc: Optional[int] = None
469
+ """
470
+ Remove samples (documents) whose `source_text_column` contains at least one sentence of len > `max_sentence_len_in_doc`.
471
+ This operations is done after long sequences wrapping (if applicable).
472
+ Typically values: 100 - 300
473
+ """
474
+ min_sentence_len_in_doc: Optional[int] = None
475
+ """
476
+ Remove samples (documents) `source_text_column` contains at least one sentence of len < `min_sentence_len_in_doc`.
477
+ This operations is done after long sequences wrapping (if applicable).
478
+ Typically values: 5 - 15
479
+ """
480
+
481
+ max_sentence_len_in_target_doc: Optional[int] = None
482
+ """
483
+ same filtering option as above but for `target_text_column`
484
+ """
485
+ min_sentence_len_in_target_doc: Optional[int] = None
486
+ """
487
+ same filtering option as above but for `target_text_column`
488
+ """
489
+
490
+ min_length_of_sequences: Optional[int] = 1
491
+ """
492
+ Remove samples (documents) whose `source_text_column` is strictly shorter than `min_length_of_sequences`.
493
+ This operations is done after long sequences wrapping (if applicable).
494
+ One can use here the same value as for sequences wrapping
495
+ in order to produce all sequences with the same length.
496
+ """
497
+ min_length_of_sequences_after_batching: Optional[int] = 1
498
+ """
499
+ Remove source sequences shorter than `min_length_of_sequences_after_batching`
500
+ This filtering is applied after batching and potentially affixing and wrapping.
501
+ """
502
+ min_length_of_target_sequences: Optional[int] = 1
503
+ """
504
+ Same as above applied for `target_text_column`
505
+ """
506
+ min_length_of_target_sequences_after_batching: Optional[int] = 1
507
+ """
508
+ Same as above applied for `target_text_column`
509
+ """
510
+
511
+ output_format: ParquetBatchFormat = ParquetBatchFormat.torch
512
+ """The format to use for output batches."""
513
+
514
+ shuffle: bool = True
515
+ """If ``True``, shuffles the dataset samples during the iteration. If ``False``
516
+ and ``order_by_length`` is ``None``, the batch samples will be produced in
517
+ natural Parquet dataset reading order."""
518
+
519
+ drop_null: bool = True
520
+ """If ``True``, drops rows containing any null value."""
521
+
522
+ seed: int = 123
523
+ """The RNG seed value for deterministic behavior."""
524
+
525
+ nb_epochs: int = 100
526
+ """
527
+ Number of passes over the data before iterations stop
528
+ """
529
+
530
+ min_batch_size: int = 1
531
+ """Drops batches whose length is less than ``min_batch_size``"""
532
+
533
+ nb_prefetch: float = 3.0
534
+ """The number of producer groups (of size `nb_parallel_fragments`) to
535
+ prefetch."""
536
+
537
+ num_parallel_calls: float = 1.5
538
+ """The number of parallel calls in map operations."""
539
+
540
+ use_threads: bool = False
541
+ """Whether pyarrow should use its internal threads to read the Parquet file.
542
+ Since we rely on the external parallelism, this param is tuned off by
543
+ default."""
544
+
545
+ ignore_checkpointed_pipeline: bool = False
546
+ """Whether to ignore the saved datapipeline state or load it when resuming.
547
+ Temporary fix for issues re-loading saved checkpoints"""
548
+
549
+ even_sharding: bool = False
550
+ """
551
+ This option should be activated ONLY for validataion on small datasets
552
+ to guarantee the perfect data sharding accross the workers.
553
+ Note that in current impmentation, activating `even_sharding` requires `sharding_in_memory=True`
554
+ which will lead to big overhead for big dataset.
555
+ Note also that some fraction of the data may be dropped due to even sharding.
556
+ For big validation datasets, prefer using large `nb_epoch` + limiting `max_validation_iterations`
557
+ instead of using `even_sharding` !
558
+
559
+ For training use case, it should left to False and combined with large number of epochs.
560
+ For evaluation use case, it also should be False since we dont care about the batch syncronization across different workers.
561
+ """
562
+ max_iteration_steps: Optional[int] = None
563
+ """
564
+ If not None, it will be used to limit the number of batches produced per each dataset
565
+ """
566
+
567
+
568
+ @dataclass
569
+ class ValidationDataLoadingConfig(DataLoadingConfig):
570
+ """
571
+ This class allows to have some hardcoded parameters for data loading of validation datasets
572
+ """
573
+
574
+ multiple_dataset_chaining: str = "concat"
575
+ nb_epochs: int = 1
576
+ min_batch_size: int = 1 # we want to keep all samples
577
+ shuffle: bool = False # we dont need the randomness here
578
+ batch_size: Optional[int] = None
579
+ max_tokens: Optional[int] = None
580
+ """
581
+ Leaving both `max_tokens` and `batch_size` to None will trigger auto-detection based on dataset metadata and distributed training world size.
582
+ to make more or less even distribution of samples across workers. Typically,
583
+ if worker_batch_size = total_batch_size // world_size <= 40, we will use batch_size=worker_batch_size,
584
+ otherwise we will use max_tokens=min(total_tokens_number // world_size, 3000).
585
+ See dataloading:SingleParquetDatasetDataloader::set_validation_params for more details.
586
+ """
587
+
588
+
589
+ @dataclass
590
+ class EvaluationDataLoadingConfig(DataLoadingConfig):
591
+ """
592
+ This class allows to have some hardcoded parameters for data loading of evaluation datasets.
593
+ In partitcular, even in distributed setup evaluation should not require workers syncronization.
594
+ Therefore, we set `even_sharding` = False to get the all data samples !
595
+ """
596
+
597
+ multiple_dataset_chaining: str = "concat"
598
+ nb_epochs: int = 1 # only ONE full pass over the full data !
599
+ min_batch_size: int = 1 # we want to keep all samples
600
+ shuffle: bool = False # we dont need the randomness here
601
+ batch_size: Optional[int] = 10
602
+ max_tokens: Optional[int] = None # this should be ok for most of models
603
+ even_sharding: bool = False # we dont want to lose any sample !
604
+ sharding_in_memory: bool = True # activate sharding by rank and world size
605
+ rank: int = 0
606
+ world_size: int = 1
607
+ max_samples: Optional[int] = None # fmt: skip
608
+ """evaluate only the first n samples (for debugging)"""
609
+
610
+
611
+ def setup_fairseq2_extensions() -> None:
612
+ # path where all datacards should be located !
613
+ cards_dir = Path(__file__).parent.parent.joinpath("datacards")
614
+ if cards_dir.exists():
615
+ default_asset_store.add_file_metadata_provider(cards_dir)
616
+
617
+
618
+ setup_fairseq2_extensions()
619
+
620
+
621
+ def get_cluster() -> Optional[str]:
622
+ """Returns the cluster name of the current environment.
623
+ User can implement their own logic to load datasets living in different locations/clusters
624
+ """
625
+ return "s3"
626
+
627
+
628
+ def _resolve_parquet_path(options: Dict[str, str]) -> Optional[str]:
629
+ cluster_name = get_cluster() or "s3"
630
+
631
+ parquet_path = options.get(cluster_name)
632
+ if parquet_path is None:
633
+ # best effort - taking first element
634
+ parquet_path = next(iter(options.values()))
635
+
636
+ return parquet_path
637
+
638
+
639
+ def _resolve_filters(
640
+ split: Optional[str],
641
+ card_filter: Optional[str],
642
+ user_filter: Optional[str],
643
+ card_partition_filters: Optional[str],
644
+ user_partition_filters: Optional[str],
645
+ ) -> Tuple[Optional[pc.Expression], Optional[pc.Expression]]:
646
+ custom_filters = user_filter or card_filter
647
+ partition_filters = user_partition_filters or card_partition_filters
648
+
649
+ if custom_filters is not None:
650
+ custom_filters = pq.filters_to_expression(eval(custom_filters))
651
+
652
+ if partition_filters is not None:
653
+ partition_filters = pq.filters_to_expression(eval(partition_filters))
654
+
655
+ if split:
656
+ split_filter = pc.equal(pc.field("split"), split)
657
+ if partition_filters is None:
658
+ partition_filters = split_filter
659
+ else:
660
+ partition_filters = pa.compute.if_else(
661
+ split_filter, partition_filters, False
662
+ )
663
+
664
+ return custom_filters, partition_filters
665
+
666
+
667
+ def _default_resolver(a, b):
668
+ res = a if bool(a) and a is not MISSING else b
669
+ return res
670
+
671
+
672
+ def get_parquet_config_from_name(
673
+ name: str, config: Optional[ParquetDatasetConfig] = None
674
+ ) -> ParquetDatasetConfig:
675
+ """
676
+ name convention is the following
677
+ - {card_name}={split}:{weight}
678
+ """
679
+ # parsing name
680
+ pattern = r"^(?P<card_name>[a-zA-Z0-9_]+)=?(?P<split>[a-zA-Z0-9_]*)?:?(?P<weight>\d+(?:\.\d+)?)?$"
681
+ match_ = re.match(pattern, name)
682
+ assert match_ is not None, f"name parsing failed: {name}"
683
+ card_name = match_.group("card_name")
684
+ split = match_.group("split")
685
+ weight = match_.group("weight")
686
+
687
+ if weight:
688
+ weight = float(weight)
689
+ logger.info(
690
+ f"Parsing {name} : card_name={card_name}, split={split}, weight={weight}"
691
+ )
692
+
693
+ reload_config = default_asset_store.retrieve_card(card_name)
694
+ cards_metadata: Dict[str, Any] = {**reload_config._metadata}
695
+
696
+ if config is None:
697
+ config = ParquetDatasetConfig(name=card_name, parquet_path="")
698
+
699
+ assert config is not None
700
+
701
+ if isinstance(config, ParquetDatasetConfig):
702
+ config_dict = asdict(config)
703
+ else:
704
+ config_dict = config # type: ignore
705
+
706
+ metadata = {}
707
+ # resolve parquet_path according to the cluster
708
+ for field in fields(ParquetDatasetConfig):
709
+ field_name = field.name
710
+ metadata[field_name] = _default_resolver(
711
+ config_dict.get(field_name), cards_metadata.get(field_name)
712
+ )
713
+
714
+ if isinstance(metadata["source_sequences"], list):
715
+ metadata["source_sequences"] = [
716
+ SonarTextColumn(**item) for item in metadata["source_sequences"]
717
+ ]
718
+
719
+ if isinstance(metadata["target_sequences"], list):
720
+ metadata["target_sequences"] = [
721
+ SonarTextColumn(**item) for item in metadata["target_sequences"]
722
+ ]
723
+
724
+ metadata["parquet_path"] = _default_resolver(
725
+ config_dict.get("parquet_path"),
726
+ _resolve_parquet_path(cards_metadata["parquet_path"]),
727
+ )
728
+
729
+ metadata["filters"], metadata["partition_filters"] = _resolve_filters(
730
+ split,
731
+ card_filter=cards_metadata.get("filters"),
732
+ user_filter=config_dict.get("filters"),
733
+ card_partition_filters=cards_metadata.get("partition_filters"),
734
+ user_partition_filters=config_dict.get("partition_filters"),
735
+ )
736
+ if weight: # priority from parsed name
737
+ metadata["weight"] = weight
738
+ metadata["name"] = name
739
+
740
+ # to patch nested hydra case !
741
+ if metadata["limit"] is not None and isinstance(metadata["limit"], dict):
742
+ metadata["limit"] = ParquetDatasetLimitOptions(**metadata["limit"])
743
+
744
+ return ParquetDatasetConfig(**metadata)
745
+
746
+
747
+ def create_dataset_config_from_cards(
748
+ config: DatasetConfig,
749
+ ) -> DatasetConfig:
750
+ if getattr(config, "name", None) is None:
751
+ return config
752
+ output_config = get_parquet_config_from_name(config.name, config) # type: ignore
753
+ return output_config
754
+
755
+
756
+ def get_renaming_mappers(configs: Sequence[DatasetConfig]) -> List[dict]:
757
+ used_columns = [x for x in ColumnsNames.__members__ if x != "dataset_name"]
758
+
759
+ pre_mapping = {
760
+ att: [getattr(cc, att) for cc in configs if hasattr(cc, att)]
761
+ for att in used_columns
762
+ }
763
+
764
+ mappers: List[dict] = [{} for _ in configs]
765
+ for att, val in pre_mapping.items():
766
+ if all(x is None for x in val):
767
+ continue
768
+ for i, name in enumerate(val):
769
+ if name is None:
770
+ raise ValueError(
771
+ f"All datasets should provide {att} param, but got {configs[i]}"
772
+ )
773
+ mappers[i][name] = getattr(ColumnsNames, att).value
774
+ return mappers
lcm/datasets/dataloader.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ import gc
7
+ import logging
8
+ from copy import deepcopy
9
+ from functools import partial
10
+ from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple
11
+
12
+ import pyarrow.compute as pc
13
+ import torch
14
+ from fairseq2.data.data_pipeline import DataPipeline, read_sequence
15
+ from fairseq2.data.text import TextTokenizer
16
+ from fairseq2.gang import FakeGang, Gang
17
+ from fairseq2.models.sequence import SequenceBatch
18
+ from fairseq2.nn.padding import pad_seqs
19
+ from fairseq2.typing import DataType
20
+ from fairseq2.utils.state import Stateful
21
+ from sonar.models.sonar_text import load_sonar_tokenizer
22
+
23
+ from lcm.datasets.base import DataLoader
24
+ from lcm.datasets.batch import LCMInput
25
+ from lcm.datasets.configs import (
26
+ ColumnsNames,
27
+ DataLoadingConfig,
28
+ ParquetDatasetConfig,
29
+ ParquetDatasetLimitOptions,
30
+ SonarDecoderConfig,
31
+ )
32
+ from lcm.datasets.utils import move_eos_to_the_end
33
+ from lcm.utils.common import set_mkl_num_threads
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ def truncate_sequence(tokens: torch.Tensor, max_len: int = 512) -> torch.Tensor:
39
+ if len(tokens) > max_len:
40
+ return tokens[:max_len]
41
+ return tokens
42
+
43
+
44
+ class LCMDataLoader(DataLoader[LCMInput, ParquetDatasetConfig], Stateful):
45
+ def __init__(
46
+ self,
47
+ data_config: DataLoadingConfig,
48
+ datasets: Sequence[ParquetDatasetConfig],
49
+ dtype: DataType = torch.float16,
50
+ use_decoder_backprop: bool = False,
51
+ max_subword_length: int = 64,
52
+ gang: Gang = None,
53
+ sonar_decoder_config: Optional[SonarDecoderConfig] = None,
54
+ ) -> None:
55
+ gang = gang or FakeGang()
56
+
57
+ super().__init__(
58
+ data_config=data_config,
59
+ datasets=datasets,
60
+ dtype=dtype,
61
+ gang=gang,
62
+ )
63
+ set_mkl_num_threads()
64
+
65
+ self.use_decoder_backprop = use_decoder_backprop
66
+ self.sonar_tokenizer: Optional[TextTokenizer] = None
67
+ self.max_subword_length = max_subword_length
68
+ if sonar_decoder_config is not None:
69
+ self.setup_sonar_decoder_tokenizer(config=sonar_decoder_config)
70
+ self._dummy_example: Optional[LCMInput] = None
71
+
72
+ def setup_sonar_decoder_tokenizer(
73
+ self,
74
+ config: SonarDecoderConfig,
75
+ ):
76
+ if self.use_decoder_backprop:
77
+ # The tokenizer
78
+ self.tokenizer = load_sonar_tokenizer(config.tokenizer, progress=False)
79
+ # Target text encoder
80
+ self.sonar_tokenizer = self.tokenizer.create_encoder(
81
+ task="translation",
82
+ lang=config.lang,
83
+ mode="target",
84
+ device=self.gang.device,
85
+ )
86
+ else:
87
+ self.sonar_tokenizer = None
88
+
89
+ def _prepare_subword_tokens(
90
+ self, batch: Dict[str, Any]
91
+ ) -> Tuple[Optional[SequenceBatch], Optional[SequenceBatch]]:
92
+ """
93
+ Given a batch of paragraphs/documents,
94
+ prepare a batch of sentences (flattened) tokenized at the subword-level
95
+ to feed to the SONAR decoder (a standard token-level decoder)
96
+
97
+ Args:
98
+ batch: attributes of a batch from the dataset.
99
+ A batch is M documents/paragraphs each spanning
100
+ a variable number of sentences {N_1, ..., N_M}.
101
+
102
+ E.g., {'text_sentences': [[sent^1_1, ...sent^1_{N_1}],
103
+ ...[sent^M_1, ... sent^M_{N_M}],
104
+ 'text_sentences_sonar_emb': [X^1 in (N_1, D), ... X^M in (N_M, D)]}
105
+ where D is the sonar embedding dimension.
106
+ Returns:
107
+ Toeknized sentences (subword-level) in (\sum_i=1^M N_i, max_len)
108
+ where max_len is min(self.max_subword_length, max length of the sentences in the batch)
109
+
110
+ """
111
+
112
+ if not self.use_decoder_backprop:
113
+ return None, None
114
+
115
+ # flatten the sentences from different documents/paragraphs
116
+ flattened_source_text = (
117
+ pc.list_flatten(batch[ColumnsNames.source_text_column.value])
118
+ .to_pandas()
119
+ .values
120
+ )
121
+
122
+ pipeline: DataPipeline = (
123
+ read_sequence(flattened_source_text)
124
+ .map(
125
+ [
126
+ self.sonar_tokenizer, # type: ignore
127
+ partial(truncate_sequence, max_len=self.max_subword_length),
128
+ ],
129
+ num_parallel_calls=int(max(8 * self.data_config.num_parallel_calls, 1)),
130
+ )
131
+ .and_return(max_num_warnings=4)
132
+ )
133
+
134
+ tokens_seqs, tokens_padding_mask = pad_seqs(list(pipeline)) # type: ignore
135
+ prefix_batch = SequenceBatch(tokens_seqs, tokens_padding_mask)
136
+ # TODO: instead of moving the EOS around, make the tokenizer append at the tokenization.
137
+ target_batch = move_eos_to_the_end(
138
+ prefix_batch,
139
+ eos_token_id=self.tokenizer.vocab_info.eos_idx,
140
+ pad_token_id=self.tokenizer.vocab_info.pad_idx,
141
+ )
142
+
143
+ return prefix_batch, target_batch
144
+
145
+ def _tokenize_batch(self, batch: Dict[str, Any]) -> LCMInput:
146
+ """
147
+ Given a batch of documents,
148
+ prepare a batch of input features for the LCM
149
+ This step is to simply fetch the right column for source/target & source text
150
+ and convert torch NestedTensors to list of tensors
151
+
152
+ Args:
153
+ batch: attributes of a batch from the dataset.
154
+ A batch is M documents each spanning
155
+ a variable number of sentences {N_1, ..., N_M}.
156
+
157
+ E.g., {'text_sentences': [[sent^1_1, ...sent^1_{N_1}],
158
+ ...[sent^M_1, ... sent^M_{N_M}],
159
+ 'text_sentences_sonar_emb': [X^1 in (N_1, D), ... X^M in (N_M, D)]}
160
+ where D is the sonar embedding dimension.
161
+ Returns:
162
+ LCMInput(
163
+ source: SONAR embeddings of the source text
164
+ i.e [X^1 in (N_1, D), ... X^M in (N_M, D)]
165
+ target: If supervised data: SONAR embeddings of the source text
166
+ tokens: Tokenized flattened sentences for the SONAR decoder (see `_prepare_subword_tokens`)
167
+ )
168
+
169
+ """
170
+
171
+ # Prepare sentence-wise subword tokens if needed:
172
+ tokens, target_tokens = self._prepare_subword_tokens(batch)
173
+
174
+ # Load target embeddings if requested and to propagate all other embeddings
175
+
176
+ possible_emb_columns = {
177
+ "source": ColumnsNames.source_column,
178
+ "target": ColumnsNames.target_column,
179
+ }
180
+
181
+ outputs = {
182
+ "tokens": tokens,
183
+ "target_tokens": target_tokens,
184
+ "name": batch[ColumnsNames.dataset_name.value],
185
+ "batch": batch,
186
+ }
187
+ for key, col in possible_emb_columns.items():
188
+ col_name = col.value
189
+ if col_name in batch:
190
+ dtype = self.dtype if "_length" not in key else torch.int64
191
+ embs = [x.to(self.gang.device).to(dtype) for x in batch[col_name]]
192
+ # Special case when some embeddings are not shaped as (T, D) e.g., XLMC's answer columns
193
+ if embs[0].dim() == 1 and "_length" not in key:
194
+ embs = [t.unsqueeze(0) for t in embs]
195
+ else:
196
+ embs = None
197
+ outputs[key] = embs
198
+ assert outputs["source"] is not None, (
199
+ "LCMDataLoader requires `source` sequences to be present in batches"
200
+ )
201
+ return LCMInput(**outputs)
202
+
203
+ def iterate_batches(self) -> Iterator[LCMInput]:
204
+ yield from map(self._tokenize_batch, self.pipeline)
205
+
206
+ def iterate_dummy_batches(self) -> Iterator[LCMInput]:
207
+ """
208
+ it's needed to simulate the data that follows the strucutre of self.pipeline (by always returning the same element).
209
+ It can be used only for fast forward pass (to avoid uneven sharding multi-gpus training).
210
+ """
211
+ if self._dummy_example is None:
212
+ # patching the params to get less data with less cost
213
+ limited_datasets = deepcopy(self.datasets)
214
+ for ds_conf in limited_datasets:
215
+ assert isinstance(ds_conf, ParquetDatasetConfig)
216
+ ds_conf.limit = ParquetDatasetLimitOptions(nb_fragments=1)
217
+
218
+ # Copy the true data config and reduce the batch size.
219
+ # When wrapping data, we want to also wrap the dummy batches
220
+ # to not exceed model max_length
221
+ dummy_dataloading_config = deepcopy(self.data_config)
222
+ dummy_dataloading_config.batch_size = 1
223
+
224
+ self._dummy_example = self._tokenize_batch(
225
+ next(
226
+ iter(
227
+ self.builder_func(
228
+ limited_datasets, dummy_dataloading_config, 0, 1
229
+ )
230
+ )
231
+ )
232
+ )
233
+ gc.collect()
234
+
235
+ while True:
236
+ yield self._dummy_example
237
+
238
+ def state_dict(self) -> Dict[str, Any]:
239
+ logger.info("Getting the data pipeline state ...")
240
+ state = self.pipeline.state_dict(strict=False)
241
+ return state
242
+
243
+ def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
244
+ if state_dict is not None:
245
+ assert self.pipeline is not None
246
+ if self.data_config.ignore_checkpointed_pipeline:
247
+ logger.warning("Ignoring existing dataloader state")
248
+ else:
249
+ try:
250
+ self.pipeline.load_state_dict(state_dict)
251
+ logger.info(f"Reloaded datapipeline state: {str(state_dict)[:400]}")
252
+ except ValueError:
253
+ logger.warning(
254
+ f"Failed to load dataloader state: {str(state_dict)[:400]}"
255
+ )
256
+ else:
257
+ # retro-compatibility
258
+ logger.warning(f"Attempt to restore a dataloader {self} with empty state")
lcm/datasets/dataloading.py ADDED
@@ -0,0 +1,1109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ import logging
7
+ from copy import deepcopy
8
+ from dataclasses import asdict, dataclass
9
+ from functools import lru_cache, partial
10
+ from typing import Any, Generator, List, Optional, Sequence
11
+
12
+ import numpy as np
13
+ import pyarrow as pa
14
+ import pyarrow.compute as pc
15
+ import pyarrow.parquet as pq
16
+ from fairseq2.data.data_pipeline import DataPipeline, DataPipelineBuilder
17
+ from fairseq2.data.parquet.tools import BatchOutputType, apply_filter, concat_table
18
+ from pyarrow.dataset import get_partition_keys
19
+ from stopes.utils.arrow_utils import (
20
+ explode_table_with_fixed_length,
21
+ explode_table_with_max_length,
22
+ is_list_like,
23
+ )
24
+
25
+ from lcm.datasets.configs import (
26
+ DataLoadingConfig,
27
+ ParquetBatchFormat,
28
+ ParquetDatasetConfig,
29
+ ValidationDataLoadingConfig,
30
+ get_renaming_mappers,
31
+ )
32
+ from lcm.datasets.parquet_utils import (
33
+ build_batching_loop_over_one_table,
34
+ define_parquet_dataset,
35
+ filter_document_by_quality,
36
+ filter_long_short_sentence_document,
37
+ filter_table_with_different_lengths,
38
+ get_row_group_level_metadata,
39
+ materialize_sequence,
40
+ prefix_and_suffix_one_list_column,
41
+ prepare_suffix_prefix_embeddings,
42
+ pyarrow_table_to_torch_dict,
43
+ renaming,
44
+ shuffle_table,
45
+ stream_parquet_fragments,
46
+ )
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+ PA_NB_CPU = 4
51
+ pa.set_cpu_count(PA_NB_CPU)
52
+ pa.set_io_thread_count(PA_NB_CPU)
53
+
54
+
55
+ def return_none_on_failure(func):
56
+ def wrapper(*args, **kwargs):
57
+ try:
58
+ return func(*args, **kwargs)
59
+ except Exception as e:
60
+ print(f"An error occurred: {e}")
61
+ return None
62
+
63
+ return wrapper
64
+
65
+
66
+ @dataclass
67
+ class GlobalPQStats:
68
+ min_number_of_fragment: int
69
+ mean_fragment_length: float
70
+ mean_fragment_number_of_tokens: Optional[float] = None
71
+
72
+
73
+ class SingleParquetDatasetDataloader:
74
+ _pq_ds: Optional[pq.ParquetDataset] = None
75
+ proxy_number_of_fragments: int
76
+ basic_stats: GlobalPQStats
77
+
78
+ def __init__(
79
+ self, dataset_config: ParquetDatasetConfig, loading_config: DataLoadingConfig
80
+ ):
81
+ self.dataset_config = deepcopy(dataset_config)
82
+ self.loading_config = deepcopy(loading_config)
83
+ self.config_post_init()
84
+ nb_parallel_fragments = self.dataset_config.nb_parallel_fragments
85
+ assert isinstance(nb_parallel_fragments, int)
86
+ self.nb_parallel_fragments: int = nb_parallel_fragments
87
+
88
+ @property
89
+ def is_validation(self) -> bool:
90
+ return isinstance(self.loading_config, ValidationDataLoadingConfig)
91
+
92
+ def head(self, top=5):
93
+ return self.dataset._dataset.head(top)
94
+
95
+ @property
96
+ def dataset(self) -> pq.ParquetDataset:
97
+ if self._pq_ds is None:
98
+ self._pq_ds = self.get_dataset()
99
+
100
+ return self._pq_ds
101
+
102
+ @property
103
+ def full_schema(self) -> pa.Schema:
104
+ return self.dataset.schema
105
+
106
+ def _warn_filters_usage(self, pq_ds: pq.ParquetDataset) -> None:
107
+ partition_filters = self.dataset_config.partition_filters
108
+
109
+ frags = pq_ds.fragments
110
+ if len(frags) == 0:
111
+ raise ValueError(
112
+ f"Working on empty dataset, probably due to wrong `partition_filters` definition : {partition_filters}"
113
+ )
114
+
115
+ partition_columns = list(
116
+ get_partition_keys(frags[0].partition_expression).keys()
117
+ )
118
+ if not partition_columns and partition_filters is not None:
119
+ raise ValueError(
120
+ f"Partition filters {partition_filters} is set but dataset has NO partition columns"
121
+ )
122
+
123
+ if partition_columns and partition_filters is not None:
124
+ expression_candidates = [
125
+ x for x in partition_columns if x in str(partition_filters)
126
+ ]
127
+ if len(expression_candidates) == 0:
128
+ logger.warning(
129
+ f"Partition filters are NOT compatible with partition columns, got: "
130
+ f"partition_filters={partition_filters} and partition_columns={partition_columns}"
131
+ )
132
+ filters = self.dataset_config.filters
133
+ if partition_columns and filters is not None:
134
+ expression_candidates = [x for x in partition_columns if x in str(filters)]
135
+ if len(expression_candidates) > 0:
136
+ logger.warning(
137
+ f"Partitionning columns {expression_candidates} are used as `filters` {filters}. ",
138
+ "You may want to use them in `partition_filters` instead",
139
+ )
140
+
141
+ def get_dataset(self) -> pq.ParquetDataset:
142
+ if isinstance(self.dataset_config.filters, str):
143
+ self.dataset_config.filters = pq.filters_to_expression(
144
+ eval(self.dataset_config.filters)
145
+ )
146
+
147
+ if isinstance(self.dataset_config.partition_filters, str):
148
+ self.dataset_config.partition_filters = pq.filters_to_expression(
149
+ eval(self.dataset_config.partition_filters)
150
+ )
151
+
152
+ pq_ds = define_parquet_dataset(
153
+ str(self.dataset_config.parquet_path), self.dataset_config.partition_filters
154
+ )
155
+
156
+ try:
157
+ self._warn_filters_usage(pq_ds)
158
+ except Exception as e:
159
+ logger.info(f"getting exception during filters examination : {e}")
160
+
161
+ return pq_ds
162
+
163
+ def set_validation_params(
164
+ self,
165
+ world_size: int,
166
+ default_max_tokens: int = 3000,
167
+ default_batch_size: int = 40,
168
+ ) -> None:
169
+ if not (
170
+ self.loading_config.batch_size is None
171
+ and self.loading_config.max_tokens is None
172
+ ):
173
+ return
174
+
175
+ total_batch_size = int(
176
+ self.basic_stats.min_number_of_fragment
177
+ * self.basic_stats.mean_fragment_length
178
+ )
179
+ batch_size = total_batch_size // world_size + int(
180
+ total_batch_size % world_size != 0
181
+ )
182
+
183
+ # for small datasets we can set `batch_size`
184
+ if (
185
+ batch_size <= default_batch_size
186
+ or self.basic_stats.mean_fragment_number_of_tokens is None
187
+ ):
188
+ self.loading_config.batch_size = min(batch_size, default_batch_size)
189
+ self.loading_config.max_tokens = None
190
+ else:
191
+ # for bigger dataset, let's use `max_tokens`
192
+ self.loading_config.batch_size = None
193
+ total_tokens_number = int(
194
+ self.basic_stats.min_number_of_fragment
195
+ * self.basic_stats.mean_fragment_number_of_tokens
196
+ )
197
+ self.loading_config.max_tokens = min(
198
+ max(total_tokens_number // world_size, 1), default_max_tokens
199
+ )
200
+
201
+ def build_dataload_pipeline(
202
+ self, rank: int = 0, world_size: int = 1
203
+ ) -> DataPipelineBuilder:
204
+ if world_size > 1:
205
+ assert self.loading_config.seed is not None, (
206
+ "for distributed training with `world_size` > 1, `seed` should be set !"
207
+ )
208
+ if self.is_validation:
209
+ self.set_validation_params(world_size)
210
+
211
+ # to propagate sharding_in_memory
212
+ if not self.dataset_config.sharding_in_memory:
213
+ sharding_in_memory = (
214
+ self.loading_config.nb_epochs * self.proxy_number_of_fragments
215
+ < 2 * world_size
216
+ )
217
+ else:
218
+ sharding_in_memory = self.dataset_config.sharding_in_memory
219
+ if self.loading_config.even_sharding:
220
+ sharding_in_memory = True
221
+
222
+ if sharding_in_memory:
223
+ logger.info("Activating sharding_in_memory")
224
+
225
+ self.random_state = np.random.RandomState(
226
+ self._get_inner_seed(rank, sharding_in_memory)
227
+ )
228
+ pipeline = self.get_fragments_pipeline()
229
+
230
+ if not sharding_in_memory:
231
+ pipeline = pipeline.shard(
232
+ shard_idx=rank,
233
+ num_shards=world_size,
234
+ allow_uneven=not self.loading_config.even_sharding,
235
+ )
236
+
237
+ pipeline = self.add_basic_fragment_loading_pipeline(pipeline)
238
+
239
+ pipeline = self.create_on_the_fly_columns(pipeline)
240
+ pipeline = self.filter_by_aligned_length(pipeline)
241
+
242
+ # If we want to wrap before adding affixes
243
+ if self.loading_config.wrap_before_affixing:
244
+ pipeline = self.add_wrapping_to_max_length_pipeline(pipeline)
245
+
246
+ # Filtering
247
+ pipeline = self.add_quality_score_filters(pipeline)
248
+ pipeline = self.add_min_sentence_number_in_doc_filter(
249
+ pipeline,
250
+ min_source_length=self.loading_config.min_length_of_sequences,
251
+ min_target_length=self.loading_config.min_length_of_target_sequences,
252
+ )
253
+ pipeline = self.add_min_max_sentence_len_in_doc_filter(pipeline)
254
+
255
+ # Affix
256
+ pipeline = self._add_source_target_affixes_to_pipeline(pipeline)
257
+
258
+ def cost_fn(table) -> float:
259
+ cost = 0
260
+ for name in [
261
+ self.dataset_config.source_column,
262
+ self.dataset_config.target_column,
263
+ ]:
264
+ if name is not None:
265
+ col = table[name]
266
+ if is_list_like(col):
267
+ cost += pa.compute.list_value_length(col).to_numpy().sum()
268
+ else:
269
+ # we should not be there, but let take batch_size as a proxy
270
+ cost += len(col)
271
+ return cost
272
+
273
+ pipeline = pipeline.dynamic_bucket(
274
+ self._shuffling_tokens_size,
275
+ cost_fn,
276
+ min_num_examples=self.nb_parallel_fragments,
277
+ max_num_examples=100, # max number of small fragements
278
+ drop_remainder=False,
279
+ )
280
+ pipeline = pipeline.map(concat_table, num_parallel_calls=1)
281
+
282
+ # wrap documents after affixing
283
+ if not self.loading_config.wrap_before_affixing:
284
+ # Note that packing with proper attention masks and position codes requires
285
+ # document indices that cover all sentences. Currently this can only come from affixing before wrapping.
286
+ # Adding affixes after wrapping will require annexing these affixes to edge sentences which is not intuitive.
287
+ if self.loading_config.shuffle:
288
+ pipeline = pipeline.map(
289
+ partial(shuffle_table, random_state=self.random_state),
290
+ num_parallel_calls=1,
291
+ )
292
+ pipeline = self.add_wrapping_to_max_length_pipeline(pipeline)
293
+
294
+ # batch with batch_size or max_tokens
295
+ pipeline = self.add_inner_pipeline(pipeline)
296
+
297
+ # Filter once again after wrapping and batching to remove batches with few number sentences
298
+ pipeline = self.add_min_sentence_number_in_doc_filter(
299
+ pipeline,
300
+ min_source_length=self.loading_config.min_length_of_sequences_after_batching,
301
+ min_target_length=self.loading_config.min_length_of_target_sequences_after_batching,
302
+ )
303
+
304
+ # Remove batch sizes with a size smaller than min_batch_size (default=1)
305
+ pipeline = pipeline.filter(
306
+ lambda table: bool(len(table) >= self.loading_config.min_batch_size)
307
+ )
308
+
309
+ if sharding_in_memory:
310
+ pipeline = pipeline.shard(
311
+ shard_idx=rank,
312
+ num_shards=world_size,
313
+ allow_uneven=not self.loading_config.even_sharding,
314
+ )
315
+ if self.loading_config.max_iteration_steps is not None:
316
+ pipeline = pipeline.take(self.loading_config.max_iteration_steps)
317
+ pipeline = self.add_format_conversion(pipeline)
318
+ return pipeline
319
+
320
def create_on_the_fly_columns(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Materialize source/target sequence columns on the fly, then drop raw inputs.

    When `source_sequences` / `target_sequences` are configured, the combined
    vector/text columns are built per table; the raw sequence columns that are
    not explicitly required by the caller are dropped afterwards.
    """
    cfg = self.dataset_config
    parallelism = self._num_parallel_call(self.nb_parallel_fragments)

    if cfg.source_sequences is not None:
        assert cfg.source_column is not None, (
            f"Expected a source_column - found {self.dataset_config.source_column}"
        )
        assert cfg.source_text_column is not None, (
            f"Expected a source_text_column - found {self.dataset_config.source_text_column}"
        )
        pipeline = pipeline.map(
            partial(
                materialize_sequence,
                column_sequence=cfg.source_sequences,
                vector_name=cfg.source_column,
                text_name=cfg.source_text_column,
            ),
            num_parallel_calls=parallelism,
        )

    if cfg.target_sequences is not None:
        assert cfg.target_column is not None, (
            f"Expected a target_column, found {self.dataset_config.target_column}"
        )
        assert cfg.target_text_column is not None, (
            f"Expected a target_text_columns, found {self.dataset_config.target_text_column}"
        )
        pipeline = pipeline.map(
            partial(
                materialize_sequence,
                column_sequence=cfg.target_sequences,
                vector_name=cfg.target_column,
                text_name=cfg.target_text_column,
            ),
            num_parallel_calls=parallelism,
        )

    # Raw sequence columns are only inputs for materialization; drop any that
    # the caller did not explicitly ask to keep.
    columns_to_drop = list(
        set(self._get_sequences_columns()) - set(self.extra_required_columns)
    )
    if columns_to_drop:
        pipeline = pipeline.map(lambda table: table.drop(columns_to_drop))

    return pipeline
def _add_source_target_affixes_to_pipeline(self, pipeline) -> DataPipelineBuilder:
    """Attach prefix/suffix embeddings and sentences before wrapping/packing."""
    affixes = self._get_suffix_prefix_vector()
    cfg = self.dataset_config

    # Source side: vector column, then text column.
    pipeline = self.add_prefix_suffix_pipeline(
        pipeline,
        cfg.source_column,
        affixes["source_prefix_vector"],
        affixes["source_suffix_vector"],
    )
    pipeline = self.add_prefix_suffix_pipeline(
        pipeline,
        cfg.source_text_column,
        affixes["source_prefix_sentences"],
        affixes["source_suffix_sentences"],
    )

    # Quality scores get a null placeholder wherever a text affix was added,
    # keeping the per-sentence alignment with the source column intact.
    pipeline = self.add_prefix_suffix_pipeline(
        pipeline,
        cfg.source_quality_column,
        pa.array([None]) if cfg.source_prefix_text else pa.array([]),
        pa.array([None]) if cfg.source_suffix_text else pa.array([]),
    )

    # Target side: vector column, then text column.
    pipeline = self.add_prefix_suffix_pipeline(
        pipeline,
        cfg.target_column,
        affixes["target_prefix_vector"],
        affixes["target_suffix_vector"],
    )
    pipeline = self.add_prefix_suffix_pipeline(
        pipeline,
        cfg.target_text_column,
        affixes["target_prefix_sentences"],
        affixes["target_suffix_sentences"],
    )

    return pipeline
+ def _num_parallel_call(self, x: float) -> int:
413
+ return int(max(self.loading_config.num_parallel_calls * x, 1))
414
+
415
+ def _nb_prefetch(self, x: float) -> int:
416
+ return int(max(self.loading_config.nb_prefetch * x, 0))
417
+
418
def config_post_init(self) -> None:
    """Validate and normalize the loading/dataset configs and precompute stats.

    Side effects: mutates ``self.loading_config`` and ``self.dataset_config``
    (column restriction, ``nb_parallel_fragments`` auto-detection) and sets
    ``self.basic_stats``, ``self._shuffling_tokens_size`` and
    ``self.proxy_number_of_fragments``.

    Raises:
        ValueError: if wrapping is requested for supervised data, if neither
            or both of `batch_size`/`max_tokens` are given, or if `max_tokens`
            is used without a `source_column`.
    """
    if getattr(self.loading_config, "len_to_wrap_long_seq", None):
        if (
            self.dataset_config.target_column
            or self.dataset_config.target_text_column
        ):
            # Wrapping would break the source/target alignment.
            raise ValueError(
                "Using `len_to_wrap_long_seq` is not supported for supervised training"
            )

    if self.loading_config.even_sharding:
        assert self.loading_config.seed is not None, (
            "`even_sharding` requires `seed` to be set"
        )

    if self.loading_config.max_tokens == 0:
        # setting max_tokens=0 turns off this option (argparser won't accept None directly)
        self.loading_config.max_tokens = None

    # Exactly one of `batch_size` / `max_tokens` must be provided
    # (validation loaders may provide neither).
    if (self.loading_config.batch_size is None) == (
        self.loading_config.max_tokens is None
    ) and (not self.is_validation or self.loading_config.max_tokens is not None):
        raise ValueError(
            "Need to provide either `batch_size` or `max_tokens` - "
            f"Received batch_size={self.loading_config.batch_size} "
            f"and max_tokens={self.loading_config.max_tokens}"
        )

    if self.loading_config.max_tokens and not self.dataset_config.source_column:
        raise ValueError(
            "Cannot batch based on `max_tokens` when `source_column` is not specified, "
            "please use `batch_size` instead."
        )

    self.dataset_config.split_to_row_groups = (
        self.dataset_config.split_to_row_groups
        if self.dataset_config.split_to_row_groups is not None
        else True
    )
    # Columns the caller explicitly requested must survive column dropping.
    self.extra_required_columns = self.dataset_config.columns or []
    self.dataset_config.override_attr("columns", self._get_minimal_columns())
    logger.info(f"Following columns will be loaded: {self.dataset_config.columns}")

    self.basic_stats = self.compute_stats()

    self._shuffling_tokens_size = self._get_shuffling_tokens_size(self.basic_stats)
    logger.info(
        f"Bucketing will require at least: {self._shuffling_tokens_size} of tokens (source + target)"
    )
    logger.info(f"Dataset stats: {asdict(self.basic_stats)}")

    self.proxy_number_of_fragments = self.basic_stats.min_number_of_fragment
    if self.dataset_config.nb_parallel_fragments is None:
        self.dataset_config.nb_parallel_fragments = (
            self._find_nb_parallel_fragments(self.basic_stats)
        )

    logger.info(f"Dataset Config: {self.dataset_config}")
    logger.info(f"Using Loading Config: {self.loading_config}")
+ def _get_shuffling_tokens_size(self, basic_stats) -> int:
479
+ """
480
+ `_shuffling_tokens_size` is used in dynamic bucketing to determine how many small parquet tables
481
+ (which are loaded raw parquet fragments that were potentially filtered on-the-fly) will be merged together :
482
+ we'll get a such number of consecutive parquet tables so that their total number of tokens (sentences)
483
+ will be greater than `_shuffling_tokens_size`.
484
+ It's called "shuffling" because all merged documents (from different tables) will be permuated together (if `shuffle=True`)
485
+ before being returned as final small batches (of required shape or volume).
486
+
487
+ The formula behind `_shuffling_tokens_size` is the following:
488
+ - If we use `max_tokens` in config, we want to have a least _shuffling_tokens_size = 4 * max_tokens,
489
+ so that at least 4 full batch will be formed next. It's good for shuffling and to avoid having "remainders" too often.
490
+ - For wrapping/packing case, we use a proxy for `max_tokens` as `batch_size` * `len_to_wrap_long_seq`
491
+ - If not, some average fragment characteristic `mean_fragment_number_of_tokens`, multiplied by 1.5 to get on average >=2 tables
492
+ - Finally, if no, other info is available, we use 10_000 as arbitrary proxy (good typical value for many of our datasets).
493
+
494
+ """
495
+ if self.loading_config.max_tokens is not None:
496
+ return 4 * self.loading_config.max_tokens
497
+ if (
498
+ self.loading_config.batch_size is not None
499
+ and self.loading_config.len_to_wrap_long_seq is not None
500
+ ):
501
+ return (
502
+ 4
503
+ * self.loading_config.len_to_wrap_long_seq
504
+ * self.loading_config.batch_size
505
+ )
506
+
507
+ if basic_stats.mean_fragment_number_of_tokens is not None:
508
+ return int(
509
+ 1.5 * basic_stats.mean_fragment_number_of_tokens
510
+ ) # to get few fragments grouped together
511
+
512
+ return 10_000 # default number that should not take a lot of RAM
513
+
514
+ def _find_nb_parallel_fragments(
515
+ self, basic_stats: GlobalPQStats, max_fragments=20, min_fragments=2
516
+ ) -> int:
517
+ """
518
+ Experimental!
519
+ Allows to determine nb of parallel fragments to load base on simple rules and dataset row group stats.
520
+ In particular, if `nb_parallel_fragments` will increase with increasing batch_size of max_tokens.
521
+ """
522
+ if basic_stats.min_number_of_fragment < 3:
523
+ return basic_stats.min_number_of_fragment
524
+
525
+ if basic_stats.mean_fragment_number_of_tokens is None:
526
+ logger.warning(
527
+ f"Cannot get `mean_fragment_number_of_tokens` from dataset {self.dataset_config}, `nb_parallel_fragement` detection can be wrong",
528
+ )
529
+
530
+ mean_fragment_number_of_tokens = (
531
+ basic_stats.mean_fragment_number_of_tokens or 5000
532
+ ) # typical, but arbitrary value
533
+ if (
534
+ self.loading_config.batch_size is None
535
+ and self.loading_config.max_tokens is None
536
+ ):
537
+ # it can happen for evaluation
538
+ nb_frags = 1.0
539
+ elif self.loading_config.batch_size is not None:
540
+ if self.loading_config.len_to_wrap_long_seq is not None:
541
+ max_tokens = (
542
+ self.loading_config.len_to_wrap_long_seq
543
+ * self.loading_config.batch_size
544
+ )
545
+ nb_frags = 3 * max_tokens / mean_fragment_number_of_tokens
546
+ else:
547
+ nb_frags = (
548
+ 5
549
+ * self.loading_config.batch_size
550
+ / basic_stats.mean_fragment_length
551
+ )
552
+ elif self.loading_config.max_tokens is not None:
553
+ nb_frags = (
554
+ 3 * self.loading_config.max_tokens / mean_fragment_number_of_tokens
555
+ )
556
+
557
+ return max(min(max_fragments, round(nb_frags)), min_fragments)
558
+
559
+ @lru_cache
560
+ def _get_sequences_columns(self):
561
+ candidate_columns = []
562
+ for col in (self.dataset_config.source_sequences or []) + (
563
+ self.dataset_config.target_sequences or []
564
+ ):
565
+ candidate_columns.append(col.text_column)
566
+ candidate_columns.append(col.sonar_column)
567
+ return [x for x in candidate_columns if x is not None]
568
+
569
+ def _get_minimal_columns(self):
570
+ # restrict on used collumns
571
+ candidate_columns = [
572
+ self.dataset_config.source_column,
573
+ self.dataset_config.source_text_column,
574
+ self.dataset_config.source_quality_column,
575
+ self.dataset_config.target_column,
576
+ self.dataset_config.target_text_column,
577
+ "split",
578
+ ] + self._get_sequences_columns()
579
+
580
+ minimal_columns: List[str] = [
581
+ x
582
+ for x in candidate_columns
583
+ if x is not None and x in self.full_schema.names
584
+ ]
585
+
586
+ if self.dataset_config.columns is None:
587
+ columns = sorted(set(minimal_columns))
588
+ else:
589
+ columns = sorted(set(minimal_columns + list(self.dataset_config.columns)))
590
+ if not set(columns).issubset(set(self.full_schema.names)):
591
+ raise ValueError(
592
+ f"columns {sorted(set(columns) - set(self.full_schema.names))} are not found in the dataset schema"
593
+ )
594
+
595
+ return columns
596
+
597
def _get_suffix_prefix_vector(self):
    """Compute affix embeddings/sentences and flatten them into a name -> value dict."""
    cfg = self.dataset_config
    nested_result = prepare_suffix_prefix_embeddings(
        cfg.source_prefix_text,
        cfg.source_suffix_text,
        cfg.target_prefix_text,
        cfg.target_suffix_text,
    )

    # Each nested entry yields a (vector, sentences) pair under these keys.
    key_pairs = (
        ("source_prefix_vector", "source_prefix_sentences"),
        ("source_suffix_vector", "source_suffix_sentences"),
        ("target_prefix_vector", "target_prefix_sentences"),
        ("target_suffix_vector", "target_suffix_sentences"),
    )

    flattened = {}
    for keys, values in zip(key_pairs, nested_result):
        for key, value in zip(keys, values):
            flattened[key] = value
    return flattened
def get_fragments_pipeline(self):
    """Stream (optionally row-group-split and shuffled) parquet fragments of the dataset."""
    split_to_row_groups = self.dataset_config.split_to_row_groups
    assert isinstance(split_to_row_groups, bool)

    # one can use `list_parquet_fragments` for a full fragments scan
    return stream_parquet_fragments(
        parquet_ds=self.dataset,
        nb_epochs=self.loading_config.nb_epochs,
        split_to_row_groups=split_to_row_groups,
        shuffle=self.loading_config.shuffle,
        seed=self.loading_config.seed,
        limit_options=self.dataset_config.limit,
        shuffling_window=20 * self.nb_parallel_fragments,
    )
def compute_stats(self, max_fragments=100) -> GlobalPQStats:
    """Estimate fragment-level statistics (counts, rows, tokens) from row-group metadata.

    At most `max_fragments` fragments are inspected, so the returned numbers
    are estimates rather than exact dataset totals.
    """
    # When sequences are materialized on the fly, the source column does not
    # exist yet in the raw files, so token stats cannot come from metadata.
    if self.dataset_config.source_sequences:
        source_column = None
    else:
        source_column = self.dataset_config.source_column

    split_to_row_groups = self.dataset_config.split_to_row_groups
    columns = [source_column] if source_column else None

    limit = self.dataset_config.limit
    if limit is not None and limit.nb_fragments is not None:
        # TODO: take into account other limit options to get better estimates
        max_fragments = min(limit.nb_fragments, max_fragments)

    self._stats_df = get_row_group_level_metadata(
        self.dataset, columns=columns, max_fragments=max_fragments
    )

    dim = 1
    if source_column:
        self._stats_df["num_tokens"] = self._stats_df[source_column].apply(
            lambda x: x["num_values"]
        )

        # For fixed-size list columns, `num_values` counts scalars; dividing
        # by the vector dimension recovers a sentence count.
        type_source = self.full_schema.field(source_column).type
        try:
            dim = type_source.value_type.list_size
            if not dim or dim < 0:
                dim = 1  # not a fixed vector size
        except AttributeError:
            logger.warning(f"source column {source_column} is not of list type")
            if self.dataset_config.nb_parallel_fragments is None:
                logger.warning("you may need to provide `nb_parallel_fragments`")
            dim = 1

    if split_to_row_groups:
        global_stats_df = self._stats_df
    elif "num_tokens" in self._stats_df:
        global_stats_df = self._stats_df.groupby("parquet_file_path").agg(
            {"num_rows": "sum", "num_tokens": "sum"}
        )
    else:
        global_stats_df = self._stats_df.groupby("parquet_file_path").agg(
            {"num_rows": "sum"}
        )

    mean_len_frag = global_stats_df["num_rows"].mean()

    if "num_tokens" in global_stats_df:
        mean_num_tokens_frag = self._stats_df["num_tokens"].mean() / dim
    else:
        mean_num_tokens_frag = None

    return GlobalPQStats(
        len(global_stats_df),
        mean_len_frag,
        mean_fragment_number_of_tokens=mean_num_tokens_frag,
    )
def add_inner_pipeline(self, pipeline: DataPipelineBuilder) -> DataPipelineBuilder:
    """Turn each pyarrow table into a stream of batches (by size or token budget)."""
    loading_config = self.loading_config

    length_columns = [
        col
        for col in (
            self.dataset_config.source_column,
            self.dataset_config.target_column,
        )
        if col is not None
    ]

    def inner_iterator(table: pa.Table) -> DataPipeline:
        # One sub-pipeline per merged table; seeds are drawn per table so that
        # inner shuffling differs between tables.
        return build_batching_loop_over_one_table(
            table=table,
            order_by_length=self.loading_config.order_by_length,
            length_column=length_columns,
            batch_size=loading_config.batch_size,
            max_tokens=loading_config.max_tokens,
            shuffle=loading_config.shuffle,
            seed=self.random_state.randint(0, 2**32),
            num_parallel_calls=self._num_parallel_call(3),
        )

    return pipeline.yield_from(inner_iterator)
+ def _get_inner_seed(self, rank: int, sharding_in_memory: bool) -> Optional[int]:
716
+ if self.loading_config.seed is not None:
717
+ if not sharding_in_memory:
718
+ return int(self.loading_config.seed) + rank * 100_000
719
+ else:
720
+ # for `sharding_in_memory`, we want the same shuffling
721
+ # to guarantee the consistent sharding across ranks
722
+ return int(self.loading_config.seed)
723
+ else:
724
+ return None
725
+
726
def add_prefix_suffix_pipeline(
    self,
    pipeline: DataPipelineBuilder,
    column: Optional[str],
    prefix,
    suffix,
) -> DataPipelineBuilder:
    """Prepend/append the given arrays to every row of a list column.

    No-op when the column is unset or when neither prefix nor suffix is given.
    """
    if column is None or (prefix is None and suffix is None):
        return pipeline
    return pipeline.map(
        partial(
            prefix_and_suffix_one_list_column,
            column=column,
            prefix_array=prefix,
            suffix_array=suffix,
        ),
        num_parallel_calls=self._num_parallel_call(self.nb_parallel_fragments),
    )
def add_basic_fragment_loading_pipeline(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Load fragments into tables, skipping broken ones, and re-apply row filters."""

    def load_fn(safe_frag):
        # Best effort: a corrupted fragment is logged and dropped instead of
        # failing the whole pipeline.
        try:
            return safe_frag.load(columns=self.dataset_config.columns)
        except Exception as e:
            logger.error(
                f"Error {e} occured while loading fragment {safe_frag} \n, skipping it"
            )
            return None

    pipeline = pipeline.map(
        load_fn,
        num_parallel_calls=self._num_parallel_call(self.nb_parallel_fragments),
    ).filter(lambda table: bool(table is not None))

    # we reapply the partition filters just in case of misusage
    # but it should not change the performance
    partition_filters = self.dataset_config.partition_filters
    filters = self.dataset_config.filters
    if partition_filters is not None and filters is not None:
        # `if_else(filters, partition_filters, False)` acts as a logical AND
        # of the two filter expressions.
        full_filter = pa.compute.if_else(filters, partition_filters, False)
    else:
        full_filter = partition_filters if filters is None else filters

    pipeline = pipeline.map(
        partial(
            apply_filter,
            filters=full_filter,
            drop_null=self.loading_config.drop_null,
        )
    )

    pipeline = pipeline.filter(lambda table: bool(len(table) > 0))
    return pipeline.prefetch(self._nb_prefetch(self.nb_parallel_fragments))
def filter_by_aligned_length(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Drop rows whose per-row list lengths disagree across aligned columns.

    A mismatch between the number of sentences and the number of sonar
    embeddings should never happen in well-formed data; this is a safety net.
    """
    parallelism = self._num_parallel_call(self.nb_parallel_fragments)

    def _apply_alignment_filter(pipe, columns):
        pipe = pipe.map(
            partial(filter_table_with_different_lengths, columns=columns),
            num_parallel_calls=parallelism,
        )
        return pipe.filter(lambda table: bool(len(table) > 0))

    source_columns: List[str] = [
        col
        for col in (
            self.dataset_config.source_column,
            self.dataset_config.source_text_column,
            self.dataset_config.source_quality_column,
        )
        if col is not None
    ]
    pipeline = _apply_alignment_filter(pipeline, source_columns)

    target_columns: List[str] = [
        col
        for col in (
            self.dataset_config.target_column,
            self.dataset_config.target_text_column,
        )
        if col is not None
    ]
    pipeline = _apply_alignment_filter(pipeline, target_columns)

    return pipeline
def add_wrapping_to_max_length_pipeline(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Split over-long documents into chunks of at most `len_to_wrap_long_seq` sentences."""
    len_to_wrap_long_seq = getattr(
        self.loading_config, "len_to_wrap_long_seq", None
    )
    if len_to_wrap_long_seq is None:
        return pipeline

    columns_to_wrap: List[str] = [
        col
        for col in (
            self.dataset_config.source_column,
            self.dataset_config.source_text_column,
            self.dataset_config.source_quality_column,
        )
        if col is not None
    ]

    if self.loading_config.packing:
        wrap_fn = return_none_on_failure(explode_table_with_fixed_length)
        logger.info(
            f"Wrapping to len_to_wrap_long_seq={len_to_wrap_long_seq} with fixed length (packing)"
        )
    else:
        wrap_fn = return_none_on_failure(explode_table_with_max_length)
        logger.info(
            f"Wrapping to len_to_wrap_long_seq={len_to_wrap_long_seq} with max length (without packing)"
        )

    pipeline = pipeline.map(
        partial(
            wrap_fn,
            columns=columns_to_wrap,
            max_seq_len=len_to_wrap_long_seq,
        ),
        num_parallel_calls=self._num_parallel_call(self.nb_parallel_fragments),
    )
    # Failed wraps were converted to None above; drop them here.
    return pipeline.filter(lambda table: table is not None)
def add_min_max_sentence_len_in_doc_filter(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Drop documents containing sentences that are too long or too short.

    The source side is filtered on `source_text_column`; the target side is
    filtered on `target_column`. Empty tables are removed after each filter.
    """
    parallelism = self._num_parallel_call(self.nb_parallel_fragments)

    if (
        self.loading_config.max_sentence_len_in_doc
        or self.loading_config.min_sentence_len_in_doc
    ):
        assert self.dataset_config.source_text_column is not None, (
            f"Expected a source_text_column, found {self.dataset_config.source_text_column}"
        )

        pipeline = pipeline.map(
            partial(
                filter_long_short_sentence_document,
                column=self.dataset_config.source_text_column,
                max_sentence_len=self.loading_config.max_sentence_len_in_doc,
                min_sentence_len=self.loading_config.min_sentence_len_in_doc,
            ),
            num_parallel_calls=parallelism,
        ).filter(lambda table: bool(len(table) > 0))

    if self.dataset_config.target_column is not None and (
        self.loading_config.max_sentence_len_in_target_doc
        or self.loading_config.min_sentence_len_in_target_doc
    ):
        # NOTE(review): the source side filters on the *text* column while this
        # filters on `target_column` (the embedding column) — confirm whether
        # `target_text_column` was intended here.
        pipeline = pipeline.map(
            partial(
                filter_long_short_sentence_document,
                column=self.dataset_config.target_column,
                max_sentence_len=self.loading_config.max_sentence_len_in_target_doc,
                min_sentence_len=self.loading_config.min_sentence_len_in_target_doc,
            ),
            num_parallel_calls=parallelism,
        ).filter(lambda table: bool(len(table) > 0))

    return pipeline
def add_min_sentence_number_in_doc_filter(
    self,
    pipeline: DataPipelineBuilder,
    min_source_length: Optional[int] = None,
    min_target_length: Optional[int] = None,
) -> DataPipelineBuilder:
    """Remove sequences with fewer sentences than the given minima.

    When `min_source_length` (resp. `min_target_length`) is set and the
    corresponding column is configured, rows whose list length is below the
    threshold are filtered out; empty tables are dropped entirely.
    """

    def _min_length_filter(table, column, length):
        keep = pc.greater_equal(pc.list_value_length(table[column]), length)
        # Fast path: skip the copy when no row would be removed.
        if pc.all(keep).as_py():
            return table
        return table.filter(keep)

    def _attach(pipe, column, length):
        return pipe.map(
            partial(_min_length_filter, column=column, length=length),
            num_parallel_calls=self._num_parallel_call(self.nb_parallel_fragments),
        ).filter(lambda table: bool(len(table) > 0))

    if (
        self.dataset_config.source_column is not None
        and min_source_length is not None
    ):
        pipeline = _attach(
            pipeline, self.dataset_config.source_column, min_source_length
        )

    if (
        self.dataset_config.target_column is not None
        and min_target_length is not None
    ):
        pipeline = _attach(
            pipeline, self.dataset_config.target_column, min_target_length
        )

    return pipeline
def add_quality_score_filters(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Keep only documents whose quality score lies in `source_quality_range`.

    No-op when no quality range is configured; requires
    `source_quality_column` to be set otherwise.
    """
    source_quality_range = self.dataset_config.source_quality_range
    if source_quality_range is None:
        return pipeline

    assert self.dataset_config.source_quality_column is not None, (
        f"Expected a source_quality_column, found {self.dataset_config.source_quality_column}"
    )

    return pipeline.map(
        partial(
            filter_document_by_quality,
            column=self.dataset_config.source_quality_column,
            min_score=source_quality_range[0],
            max_score=source_quality_range[1],
        ),
        num_parallel_calls=self._num_parallel_call(self.nb_parallel_fragments),
    ).filter(lambda table: bool(len(table) > 0))
def add_format_conversion(
    self, pipeline: DataPipelineBuilder
) -> DataPipelineBuilder:
    """Convert output batches from pyarrow to the configured format (pandas/torch)."""
    output_format = self.loading_config.output_format
    if output_format == ParquetBatchFormat.pandas:
        pipeline = pipeline.map(lambda table: table.to_pandas())
    elif output_format == ParquetBatchFormat.torch:
        pipeline = pipeline.map(pyarrow_table_to_torch_dict)
    return pipeline
def get_python_iterator(
    self, rank: int = 0, world_size: int = 1
) -> Generator[BatchOutputType, None, None]:  # type: ignore
    """Build the pipeline for this (rank, world_size) and iterate over its batches."""
    built = (
        self.build_dataload_pipeline(rank=rank, world_size=world_size)
        .prefetch(self._nb_prefetch(5))
        .and_return(max_num_warnings=4)
    )
    yield from built
def parquet_iterator(
    dataset_config: ParquetDatasetConfig,
    loading_config: DataLoadingConfig,
    rank: int,
    world_size: int,
) -> Generator[BatchOutputType, None, None]:  # type: ignore
    """Convenience wrapper: build a single-dataset loader and iterate its batches."""
    loader = SingleParquetDatasetDataloader(dataset_config, loading_config)
    yield from loader.get_python_iterator(rank, world_size)
def build_parquet_iterator_pipeline(
    dataset_config: ParquetDatasetConfig,
    loading_config: DataLoadingConfig,
    rank: int = 0,
    world_size: int = 1,
) -> DataPipelineBuilder:
    """Build (without iterating) the data-loading pipeline of a single dataset."""
    loader = SingleParquetDatasetDataloader(dataset_config, loading_config)
    return loader.build_dataload_pipeline(rank=rank, world_size=world_size)
def ds_name(conf: "ParquetDatasetConfig") -> str:
    """Human-readable dataset identifier: explicit name if set, else the parquet path."""
    return conf.name if conf.name is not None else str(conf.parquet_path)
def circular_shift_left(lst: List[Any], k: int) -> List[Any]:
    """Rotate `lst` left by `k` positions; `k` may exceed `len(lst)`."""
    if len(lst) <= 1:
        return lst
    offset = k % len(lst)  # handle shifts larger than the list length
    return lst[offset:] + lst[:offset]
def build_weighted_pipeline_with_renaming(
    dataset_configs: Sequence[ParquetDatasetConfig],
    loading_config: DataLoadingConfig,
    rank: int = 0,
    world_size: int = 1,
) -> DataPipeline:
    """Combine several parquet dataset pipelines (sample/concat/round_robin) with renaming."""
    assert loading_config.multiple_dataset_chaining in [
        "sample",
        "concat",
        "round_robin",
    ]

    # Split the parallelism/prefetch budget evenly across all datasets.
    dataset_configs = list(dataset_configs)
    n_datasets = len(dataset_configs)
    loading_config.num_parallel_calls = loading_config.num_parallel_calls / n_datasets
    loading_config.nb_prefetch = loading_config.nb_prefetch // n_datasets

    name_mappers = get_renaming_mappers(dataset_configs)

    def process_one_pipeline(cc, mapper):
        return build_parquet_iterator_pipeline(
            dataset_config=cc,
            loading_config=loading_config,
            rank=rank,
            world_size=world_size,
        ).map(
            partial(renaming, mapper=mapper, name=ds_name(cc)),
            num_parallel_calls=1,
        )

    # creating all datasets pipeline in parallel
    pipelines: List[DataPipelineBuilder] = [
        process_one_pipeline(cc, mapper)
        for cc, mapper in zip(dataset_configs, name_mappers)
    ]

    if len(pipelines) == 1:
        return (
            pipelines[0]
            .prefetch(int(max(loading_config.nb_prefetch, 1)))
            .and_return(max_num_warnings=4)
        )

    if loading_config.seed is not None:
        seed = loading_config.seed + (0 if loading_config.even_sharding else rank)
    else:
        seed = None

    finalized = [pp.and_return(max_num_warnings=4) for pp in pipelines]

    chaining = loading_config.multiple_dataset_chaining
    if chaining == "concat":
        # TODO : check that all weights = 1
        weighted_pipeline = DataPipeline.concat(
            circular_shift_left(finalized, k=rank),
        )
    elif chaining == "round_robin":
        weighted_pipeline = DataPipeline.round_robin(
            circular_shift_left(finalized, k=rank), allow_repeats=False
        )
    else:  # "sample"
        weighted_pipeline = DataPipeline.sample(
            finalized,
            [getattr(cc, "weight", 1.0) for cc in dataset_configs],
            seed=seed,
        )

    # try to prefetch at least one element from each dataset
    return weighted_pipeline.prefetch(
        int(max(loading_config.nb_prefetch * n_datasets**2, 1))
    ).and_return(max_num_warnings=4)
lcm/datasets/parquet_utils.py ADDED
@@ -0,0 +1,1141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+
7
+ import logging
8
+ from dataclasses import dataclass
9
+ from functools import lru_cache, reduce, wraps
10
+ from pickle import dumps, loads
11
+ from typing import Any, Iterator, List, Optional, Union
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import polars as pl
16
+ import pyarrow as pa
17
+ import pyarrow.compute as pc
18
+ import pyarrow.parquet as pq
19
+ import torch
20
+ from fairseq2.data.data_pipeline import (
21
+ DataPipeline,
22
+ DataPipelineBuilder,
23
+ read_iterator,
24
+ read_sequence,
25
+ )
26
+ from fairseq2.data.parquet.tools import (
27
+ NestedDict,
28
+ NestedDictValue,
29
+ add_partitioning_values,
30
+ compute_rows_length,
31
+ get_dataset_fragments,
32
+ split_fragment_in_row_groups,
33
+ )
34
+ from joblib import Parallel, delayed
35
+ from numpy.typing import NDArray
36
+ from pyarrow.dataset import get_partition_keys
37
+ from retrying import retry
38
+ from stopes.modules.preprocess.sonar_text_embedding import (
39
+ LangColumnConfig,
40
+ SonarTextBatchEmbedder,
41
+ SonarTextEmbedderConfig,
42
+ )
43
+ from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
44
+ from stopes.utils.arrow_utils import (
45
+ hstack_pyarray_list,
46
+ is_list_like,
47
+ pyarrow_column_to_array,
48
+ simple_array_to_nested,
49
+ )
50
+ from tqdm.auto import tqdm
51
+
52
+ from lcm.datasets.configs import (
53
+ ColumnsNames,
54
+ ParquetDatasetLimitOptions,
55
+ SonarTextColumn,
56
+ )
57
+ from lcm.utils.common import batched
58
+
59
# Optional numba acceleration: `_get_hierarchical_indices_and_offsets` below
# is decorated with @njit; without numba it runs as plain Python.
try:
    from numba import njit
except ModuleNotFoundError:
    print("Numba is not installed. Fall-back to the non-recompiled version")

    def empty_jit(f):
        # Identity decorator standing in for numba.njit.
        @wraps(f)
        def _f(*args, **kwargs):
            return f(*args, **kwargs)

        return _f

    njit = empty_jit


# Retry wrapper for transient OSError failures when reading fragments
# (e.g. expired object-store credentials).
# NOTE(review): stop_max_attempt_number=1 allows only a single attempt, so the
# exponential-backoff settings never take effect — confirm retries are intended.
loading_retry = retry(
    retry_on_exception=lambda exception: isinstance(exception, OSError),
    stop_max_attempt_number=1,
    wait_exponential_multiplier=2,
    wait_exponential_max=20,
)


logger = logging.getLogger(__name__)
83
+
84
+
85
def prefix_and_suffix_one_list_column(
    table: pa.Table, column: str, prefix_array: pa.Array, suffix_array: pa.Array
):
    """Prepend `prefix_array` and append `suffix_array` to every list in
    `table[column]`, casting both to the column's dtype when needed."""

    def _tile_as_lists(values: pa.Array) -> pa.ChunkedArray:
        # Wrap `values` as a single-list array and repeat it once per row.
        single = pa.ListArray.from_arrays([0, len(values)], values)
        return pa.chunked_array([single] * len(table))

    prefix_extended = _tile_as_lists(prefix_array)
    suffix_extended = _tile_as_lists(suffix_array)
    target_dtype = table[column].type
    if prefix_extended.type != target_dtype:
        prefix_extended = prefix_extended.cast(target_dtype)
    if suffix_extended.type != target_dtype:
        suffix_extended = suffix_extended.cast(target_dtype)

    combined = hstack_pyarray_list(prefix_extended, table[column], suffix_extended)
    return table.drop([column]).append_column(column, combined)
102
+
103
+
104
def define_parquet_dataset(parquet_path: str, partition_filters) -> pq.ParquetDataset:
    """Open the parquet dataset at `parquet_path`, restricted by
    `partition_filters` (pyarrow filter expression or DNF list)."""
    return pq.ParquetDataset(parquet_path, filters=partition_filters)
109
+
110
+
111
@lru_cache()
def default_sonar_pipeline() -> SonarTextBatchEmbedder:
    """Build (once, memoized) a CPU Sonar text embedder configured for
    English (`eng_Latn`) input on the `input_text` column."""
    config = SonarTextEmbedderConfig(
        column_config=[LangColumnConfig("input_text", lang_value="eng_Latn")],
        batch_size=10,
        device="cpu",
    )
    return SonarTextBatchEmbedder(config)
121
+
122
+
123
@lru_cache(2000)
def _get_embed_sentences(text: Optional[str]) -> "tuple[pa.Array, pa.Array]":
    # Split `text` into English sentences and embed each with the default
    # Sonar pipeline; memoized (LRU, 2000 entries) since the same constant
    # prefix/suffix texts are embedded repeatedly.
    # Returns (vectors, sentences); for falsy `text` both are empty arrays of
    # the correct type. (Annotation fixed: the previous `-> pa.Array` did not
    # reflect the tuple return value.)
    sentences_splitter = get_split_algo("eng_Latn", "default")
    lstbe = default_sonar_pipeline()
    sentences = pa.array(sentences_splitter(text) if text else [""])
    input_table = pa.Table.from_pydict({"input_text": sentences})
    vectors = pyarrow_column_to_array(lstbe(input_table)["input_text_sonar_emb"])
    if not text:
        # empty output of the right type
        vectors = vectors.slice(0, 0)
        sentences = sentences.slice(0, 0)
    return vectors, sentences
135
+
136
+
137
def prepare_suffix_prefix_embeddings(*args):
    """Embed each optional text in `args`, returning one
    (vectors, sentences) pair per argument.

    When every argument is None, returns (None, None) placeholders without
    touching `_get_embed_sentences` (avoids loading the Sonar model).
    """
    if any(text is not None for text in args):
        return [_get_embed_sentences(text) for text in args]
    return [(None, None)] * len(args)
142
+
143
+
144
def from_pyarrow_to_torch_tensor(
    arr: Union[pa.Array, pa.ChunkedArray], strict: bool = False
) -> NestedDictValue:
    """Convert a pyarrow array into torch tensors / python containers.

    Examples of supported inputs::

        struct_array = pa.Array.from_pandas([{"x": 4, "y": "RR"}] * 10)
        nest_array = pa.Array.from_pandas([[{'a': 1}, {'a': 2}]])

    :param arr: column content; chunked arrays are combined first.
    :param strict: if True, raise ``NotImplementedError`` for types with no
        torch equivalent instead of returning the array unchanged.
    :raises ValueError: if the array contains nulls.
    """
    # for future ideas https://arrow.apache.org/docs/python/generated/pyarrow.Tensor.html
    # for sparse matrix support https://github.com/apache/arrow/blob/main/python/pyarrow/tests/test_sparse_tensor.py

    if arr.null_count != 0:
        raise ValueError("to torch conversion does not support null values")

    arr = pyarrow_column_to_array(arr)

    arr_type = arr.type

    # Zero-copy fast path. Fix: the previous version attempted this identical
    # call twice for primitive types (first swallowing every exception, then
    # retrying with `except pa.ArrowInvalid`); the second attempt fully
    # determined the outcome, so the first block was dead code.
    try:
        return torch.from_numpy(arr.to_numpy(zero_copy_only=True))
    except pa.ArrowInvalid:
        pass

    if pa.types.is_dictionary(arr_type):
        return from_pyarrow_to_torch_tensor(arr.dictionary_decode())

    if pa.types.is_string(arr_type):
        return arr.to_pandas().tolist()

    if pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type):
        if pa.types.is_primitive(arr_type.value_type):
            return arr.to_pandas().map(torch.from_numpy).tolist()

        if pa.types.is_fixed_size_list(arr_type.value_type) and pa.types.is_primitive(
            arr_type.value_type.value_type
        ):
            # list<fixed_size_list<primitive>> -> one 2D tensor per row
            return (
                arr.to_pandas()
                .map(
                    lambda x: torch.from_numpy(
                        np.vstack(x) if len(x) > 0 else np.array([], dtype=np.float32)
                    )
                )
                .tolist()
            )

    if pa.types.is_fixed_size_list(arr_type):
        if pa.types.is_primitive(arr_type.value_type):
            # flat values reshaped into (nb_rows, list_size)
            return torch.from_numpy(np.reshape(arr.values, (-1, arr_type.list_size)))

    if pa.types.is_struct(arr_type):
        return {
            arr_type.field(i).name: from_pyarrow_to_torch_tensor(arr.field(i))
            for i in range(arr_type.num_fields)
        }

    if pa.types.is_nested(arr_type):
        # TODO: deal with arr = [[{'a': 1}, {'a': 2}]]
        pass

    if strict:
        raise NotImplementedError(f"{arr_type} cannot be converted to torch.Tensor")
    else:
        return arr  # keeping as in the original pyarrow form
212
+
213
+
214
def pyarrow_table_to_torch_dict(tt: pa.Table, strict: bool = False) -> NestedDict:
    """Convert each column of `tt` with `from_pyarrow_to_torch_tensor`.

    Columns that fail conversion with ValueError (e.g. containing nulls) are
    kept in their original pyarrow form and the failure is logged.
    """
    out = {}
    for col in tt.column_names:
        try:
            out[col] = from_pyarrow_to_torch_tensor(tt[col], strict)
        except ValueError as e:
            # Fix: use lazy %-formatting; the previous call passed `str(e)` as
            # a stray positional argument after an f-string, which the logging
            # module cannot format.
            logger.info(
                "Column %s of type %s was not converted to torch as expected: %s",
                col,
                tt[col].type,
                e,
            )
            out[col] = tt[col]
    return out
226
+
227
+
228
def add_fragments_trace(table: pa.Table, fragment: pa.dataset.Fragment) -> pa.Table:
    """Append provenance columns recording which row groups of `fragment`
    produced each row, and the row's position within the fragment."""
    row_group_ids = np.array(
        [int(rg.id) for rg in fragment.row_groups], dtype=np.int32
    )
    # same row-group id list repeated for every row
    table = table.append_column("__row_groups_ids", [row_group_ids] * len(table))
    position_in_fragment = pa.array(np.arange(len(table), dtype=np.int32))
    table = table.append_column("__index_in_fragement", position_in_fragment)
    return table
238
+
239
+
240
def shuffle_table(table: pa.Table, random_state: np.random.RandomState) -> pa.Table:
    """Return a copy of `table` with its rows permuted by `random_state`."""
    shuffled_indices = random_state.permutation(len(table))
    return table.take(pa.array(shuffled_indices))
243
+
244
+
245
class SafeFragment:
    """
    Experimental :
    Simple wrapper around `ParquetFileFragment` that allows to reinit the state of filesystem
    if aws session token has expired.
    """

    fragment: pa.dataset.ParquetFileFragment

    def __init__(self, fragment: pa.dataset.ParquetFileFragment):
        self.fragment = fragment

    def __repr__(self) -> str:
        out = ""
        out += "SafeFragment \n"
        out += "path = " + self.fragment.path + "\n"
        out += f"row_groups = {[int(rg.id) for rg in self.fragment.row_groups]} \n"
        out += f"physical_schema = \n {self.fragment.physical_schema} \n"
        return out

    @loading_retry
    def load(self, columns: Optional[List[str]] = None) -> pa.Table:
        """Materialize the fragment as a table.

        :param columns: subset of columns to read; columns absent from the
            fragment's physical schema are silently skipped (they may be
            partition keys, re-attached below).
        :return: the fragment's rows, plus partition values and trace columns.
        """
        if columns is not None:
            fragment_columns = [
                col for col in columns if col in self.fragment.physical_schema.names
            ]
        else:
            fragment_columns = self.fragment.physical_schema.names
        # adding technical columns for tracking
        fragment_columns = list(fragment_columns) + [
            "__batch_index",
            "__fragment_index",
            "__filename",
        ]
        try:
            fragment_table = self.fragment.to_table(
                columns=fragment_columns, use_threads=False
            )

        except OSError as e:
            # Fix: pass the exception as a lazy %-argument; the previous call
            # handed `str(e)` to logging as a stray positional arg with no
            # placeholder, which logging cannot format.
            logger.info(
                "could not load fragment, reinit the fragment state. Error: %s", e
            )
            # pickle round-trip rebuilds the fragment, which re-creates the
            # underlying filesystem handle (e.g. with refreshed credentials)
            self.fragment = loads(dumps(self.fragment))
            fragment_table = self.fragment.to_table(
                columns=fragment_columns, use_threads=False
            )

        fragment_table = add_partitioning_values(fragment_table, self.fragment, columns)
        fragment_table = add_fragments_trace(fragment_table, self.fragment)
        return fragment_table
296
+
297
+
298
def _parquet_fragments_to_pipeline_builder(
    file_ds_fragments: List[pa.dataset.Fragment],
    nb_epochs: int = 1,
    shuffle: bool = True,
    seed: Optional[int] = None,
) -> DataPipelineBuilder:
    """Build a pipeline of `SafeFragment`s covering `nb_epochs` passes over
    `file_ds_fragments`.

    With shuffling, each epoch receives an independent permutation drawn from
    one RNG seeded with `seed` (a random seed is drawn when not provided).
    """
    if shuffle:
        if seed is None:
            seed = int(torch.randint(0, 2**31, ()).item())

        rng = np.random.RandomState(seed)
        fragments_arr = np.asarray(file_ds_fragments, dtype="O")
        per_epoch = [rng.permutation(fragments_arr) for _ in range(nb_epochs)]
        ds_fragments = np.concatenate(per_epoch).tolist()
    else:
        ds_fragments = file_ds_fragments * nb_epochs

    return read_sequence(ds_fragments).map(SafeFragment)
319
+
320
+
321
def list_parquet_fragments(
    parquet_ds: pq.ParquetDataset,
    nb_epochs: int = 1,
    split_to_row_groups: bool = True,
    shuffle: bool = True,
    seed: Optional[int] = None,
    limit_options: Optional[ParquetDatasetLimitOptions] = None,
    nb_jobs: int = 10,
) -> DataPipelineBuilder:
    """Enumerate the dataset's fragments (optionally split down to row
    groups), apply the `limit_options` truncations (fraction of files, number
    of files / fragments / rows), and return a pipeline builder over
    `SafeFragment`s repeated for `nb_epochs`.

    NOTE(review): when splitting, per-fragment row counts are taken from
    `frag.row_groups[0].num_rows`, so the `nb_rows` limit is approximate —
    confirm this is intended.
    """
    if limit_options is None:
        limit_options = ParquetDatasetLimitOptions()

    file_ds_fragments = get_dataset_fragments(parquet_ds, parquet_ds._filter_expression)
    # dataset root used for log messages: strip the partition suffix from the
    # first file's path
    proxy_ds_path = "/".join(parquet_ds.files[0].split("=")[0].split("/")[:-1])

    logger.info(f"{proxy_ds_path} : full number of files {len(file_ds_fragments)}")
    if limit_options.fraction_of_files is not None:
        # keep at least one file
        file_ds_fragments = file_ds_fragments[
            : max(
                int(round(limit_options.fraction_of_files * len(file_ds_fragments))), 1
            )
        ]
        logger.info(
            f"{proxy_ds_path} : reducing number of files to {len(file_ds_fragments)} because of fraction_of_files={limit_options.fraction_of_files}"
        )
    if limit_options.nb_files is not None and limit_options.nb_files < len(
        file_ds_fragments
    ):
        file_ds_fragments = file_ds_fragments[: limit_options.nb_files]
        logger.info(
            f"{proxy_ds_path} : reducing number of files to {len(file_ds_fragments)} because of nb_files={limit_options.nb_files}"
        )

    output_fragments = []  # fragments retained after applying limits
    total_nb_rows = 0
    if split_to_row_groups:
        logger.info(f"{proxy_ds_path} : starting split in row groups")

        with Parallel(backend="threading", n_jobs=nb_jobs) as parallel:
            total_nb_fragments = 0
            early_stop = False

            # split files into row-group fragments in parallel, by batches
            for batch_of_files in batched(file_ds_fragments, 20 * nb_jobs):
                row_groups = parallel(
                    delayed(split_fragment_in_row_groups)(ff) for ff in batch_of_files
                )
                new_file_fragments = [x for y in row_groups for x in y]
                if limit_options.nb_rows is not None:
                    # row counts are only needed to enforce the nb_rows limit
                    new_file_fragments_stats = parallel(
                        delayed(lambda frag: frag.row_groups[0].num_rows)(ff)
                        for ff in new_file_fragments
                    )
                else:
                    new_file_fragments_stats = [0] * len(new_file_fragments)

                for nb_row, frag in zip(new_file_fragments_stats, new_file_fragments):
                    output_fragments.append(frag)
                    total_nb_rows += nb_row
                    total_nb_fragments += 1
                    if (
                        limit_options.nb_fragments is not None
                        and total_nb_fragments >= limit_options.nb_fragments
                    ):
                        early_stop = True
                        if limit_options.nb_rows is not None:
                            logger.info(
                                f"{proxy_ds_path} : nb_fragments limit {limit_options.nb_fragments} was reached with around {total_nb_rows} rows"
                            )
                        else:
                            logger.info(
                                f"{proxy_ds_path} : nb_fragments limit {limit_options.nb_fragments} was reached"
                            )
                        break
                    if (
                        limit_options.nb_rows is not None
                        and total_nb_rows >= limit_options.nb_rows
                    ):
                        early_stop = True
                        logger.info(
                            f"{proxy_ds_path} : nb_rows limit {limit_options.nb_rows} was reached with around {total_nb_fragments} fragments"
                        )
                        break
                if early_stop:
                    break
    else:
        # whole-file fragments; only nb_fragments / nb_rows limits apply
        for frag in file_ds_fragments[: limit_options.nb_fragments]:
            output_fragments.append(frag)
            if limit_options.nb_rows is not None:
                total_nb_rows += frag.count_rows()
                if total_nb_rows >= limit_options.nb_rows:
                    break

    logger.info(f"{proxy_ds_path} : finding fragments {len(output_fragments)}")

    return _parquet_fragments_to_pipeline_builder(
        output_fragments,
        nb_epochs=nb_epochs,
        shuffle=shuffle,
        seed=seed,
    )
421
+
422
+
423
def compute_length_splits(
    length_col: NDArray[np.int32],
    max_tokens: int,
    order_by_length: bool = True,
    drop_long_sample: bool = True,
) -> List[NDArray[np.int32]]:
    """Split the samples described by `length_col` into chunks whose padded
    size (chunk size * max element length in the chunk) is <= `max_tokens`.

    Args:
        length_col (np.ndarray): per-sample lengths.
        max_tokens (int): budget per chunk, counting padding up to the
            longest element of the chunk.
        order_by_length (bool): sort samples by length first (tighter packing).
        drop_long_sample (bool): drop samples longer than `max_tokens`;
            otherwise append them at the end as singleton chunks.

    Returns:
        List[np.ndarray]: splits that contain indices over the original length_col
    """
    argsort_ind = (
        np.argsort(length_col)
        if order_by_length
        else np.arange(len(length_col), dtype=np.int32)
    )

    sorted_length_col = length_col[argsort_ind]

    # samples that individually fit into the budget
    small_elements_masks = sorted_length_col <= max_tokens
    big_elements_inds = argsort_ind[~small_elements_masks]

    argsort_ind = argsort_ind[small_elements_masks]
    sorted_length_col = sorted_length_col[small_elements_masks]

    size = len(sorted_length_col)
    splits = []
    begin, end = 0, 0
    while end < size:
        begin = end
        # Fix: seed the running max with the first element of the *current*
        # chunk. The previous version read `sorted_length_col[begin]` before
        # updating `begin`, seeding with the previous chunk's first element
        # and over-estimating padding when the input is not length-sorted.
        current_max_len = sorted_length_col[begin]
        while end < size:
            current_max_len = max(current_max_len, sorted_length_col[end])
            if current_max_len * (end + 1 - begin) > max_tokens:
                splits.append(argsort_ind[begin:end])
                break
            end += 1
        else:
            # inner loop exhausted the input: flush the trailing chunk
            if begin < size:
                splits.append(argsort_ind[begin:])

    # adding big sample at the end one by one
    if not drop_long_sample and len(big_elements_inds):
        splits.extend(np.array_split(big_elements_inds, len(big_elements_inds)))

    return splits
476
+
477
+
478
def build_batching_loop_over_one_table(
    table: pa.Table,
    order_by_length: bool = False,
    length_column: Optional[List[str]] = None,
    batch_size: Optional[int] = None,
    max_tokens: Optional[int] = None,
    shuffle: bool = True,
    seed: Optional[int] = None,
    num_parallel_calls: int = 1,
) -> DataPipeline:
    """Build a pipeline that yields batches (sub-tables) of `table`.

    Exactly one of `batch_size` (fixed number of rows per batch) or
    `max_tokens` (padding-aware token budget; requires `length_column`) must
    be provided.

    Fixes: `length_column` was annotated `List[Optional[str]] = None` — the
    container itself is optional, so `Optional[List[str]]`; the deprecated
    `logger.warn` alias is replaced with `logger.warning`.
    """
    if max_tokens is not None:
        assert length_column is not None, (
            "Need to provide a column to compute the number of tokens"
        )

    random_state = np.random.RandomState(seed)
    if length_column is not None and len(length_column) > 0:
        # total per-row length, summed over all provided length columns
        length_col = reduce(
            np.add, (compute_rows_length(table[lc]) for lc in length_column)
        )
    else:
        if shuffle:
            # random pseudo-lengths -> random batch membership
            length_col = random_state.randint(0, 2**23, len(table))
        else:
            length_col = np.zeros(len(table), dtype=np.int32)

    if batch_size is not None:
        if order_by_length:
            sorting_ind = np.argsort(length_col, kind="stable")
        else:
            sorting_ind = np.arange(len(length_col), dtype=np.int32)

        order_tt = pa.Table.from_arrays([pa.array(sorting_ind)], ["order"])
        batches = [ind["order"] for ind in order_tt.to_batches(batch_size)]
    elif max_tokens is not None:
        batches = compute_length_splits(
            length_col, max_tokens, order_by_length=order_by_length
        )
    else:
        raise ValueError("unknown batching method")

    if shuffle:
        batches = [batches[i] for i in random_state.permutation(len(batches))]

    def _getter(ind):
        # best-effort take: a failing batch is logged and dropped downstream
        try:
            tt = table.take(ind)
            return tt
        except Exception as e:
            logger.warning(f"Unexpected error : \n {str(e)} \n {table} \n {ind}")
            return None

    return (
        read_sequence(batches)
        .map(_getter, num_parallel_calls=num_parallel_calls)
        .filter(lambda tt: bool(tt is not None))
        .and_return(max_num_warnings=4)
    )
536
+
537
+
538
def filter_long_short_sentence_document(
    batch: pa.Table,
    column: str,
    max_sentence_len: Optional[int],
    min_sentence_len: Optional[int],
) -> pa.Table:
    """Keep only the rows of `batch` whose longest sentence in `column` is
    <= `max_sentence_len` AND whose shortest sentence is >= `min_sentence_len`
    (sentence lengths measured in bytes).

    Fix: the second condition previously re-checked `max_sentence_len` against
    the per-row minimum, so `min_sentence_len` was never enforced.
    """
    assert max_sentence_len is not None or min_sentence_len is not None
    if min_sentence_len is None:
        min_sentence_len = 0

    if max_sentence_len is None:
        max_sentence_len = 2**32

    tt = pl.from_arrow(batch.select([column]), rechunk=False)
    assert isinstance(tt, pl.DataFrame)
    sentence_byte_lens = pl.col(column).list.eval(pl.col("").str.len_bytes())
    filter_ = tt.with_columns(
        (sentence_byte_lens.list.max() <= max_sentence_len)
        & (sentence_byte_lens.list.min() >= min_sentence_len)
    )[column].to_arrow()

    # avoid copying when nothing is filtered out
    if pc.all(filter_).as_py():
        return batch
    return batch.filter(filter_)
567
+
568
+
569
def filter_document_by_quality(
    batch: pa.Table,
    column: str,
    min_score: Optional[float] = None,
    max_score: Optional[float] = None,
) -> pa.Table:
    """Keep only the rows of `batch` whose per-element scores in `column`
    all lie within [`min_score`, `max_score`].

    Fix: the previous signature used `min_score=Optional[float]`, making the
    *typing object* the default value (`=` instead of `: ... = None`), so the
    `is None` checks below could never trigger for omitted arguments.
    """
    if min_score is None and max_score is None:
        return batch

    if min_score is None:
        min_score = -float(np.inf)
    if max_score is None:
        max_score = float(np.inf)

    tt = pl.from_arrow(batch.select([column]), rechunk=False)
    assert isinstance(tt, pl.DataFrame)
    filter_ = tt.with_columns(
        (pl.col(column).list.max() <= max_score)
        & (pl.col(column).list.min() >= min_score)
    )[column].to_arrow()
    # avoid copying when nothing is filtered out
    if pc.all(filter_).as_py():
        return batch
    return batch.filter(filter_)
592
+
593
+
594
def renaming(inp: NestedDict, mapper: dict, name: str) -> NestedDict:
    """Rename the keys/columns of `inp` through `mapper` and tag the result
    with the dataset name under `ColumnsNames.dataset_name.value`.

    Supports dict, pandas DataFrame and pyarrow Table inputs.

    Raises:
        TypeError: for unsupported input types (the previous version crashed
            with an UnboundLocalError on `res` instead).
    """
    renamed_name = ColumnsNames.dataset_name.value
    if isinstance(inp, dict):
        out_dict = {mapper.get(key, key): value for key, value in inp.items()}
        out_dict[renamed_name] = name
        return out_dict
    if isinstance(inp, pd.DataFrame):
        out_pd = inp.rename(mapper=mapper, axis=1)
        out_pd[renamed_name] = name
        return out_pd
    if isinstance(inp, pa.Table):
        out_pa: pa.Table = inp.rename_columns(
            [mapper.get(key, key) for key in inp.column_names],
        )
        return out_pa.append_column(renamed_name, pa.array([name] * len(out_pa)))
    raise TypeError(f"renaming does not support inputs of type {type(inp).__name__}")
611
+
612
+
613
def materialize_sequence(
    table: pa.Table,
    column_sequence: List[SonarTextColumn],
    vector_name: str,
    text_name: str,
) -> pa.Table:
    """
    Given `table`, it materializes `column_sequence`.
    Different elements from `column_sequence` are concatenated sequentially.
    Constant text elements will be sentencized and sonarized.
    It also accepts columns with single text and embeddings values instead of list.

    It returns a new table with two new columns with sequences of sentences and corresponding sequences of their embeddings.

    :param table: input batch.
    :param column_sequence: ordered parts to concatenate; each part is either
        a constant text (`text_value`) or a (text_column, sonar_column) pair.
    :param vector_name: name of the appended embeddings-sequence column.
    :param text_name: name of the appended sentences-sequence column.
    """

    table_len = len(table)
    sentences_seq = []  # per-part list arrays of sentences
    vectors_seq = []  # per-part list arrays of embeddings

    # use the first real sonar column's dtype as the common embedding dtype
    target_dtype = None
    for col in column_sequence:
        if col.sonar_column is not None:
            target_dtype = table[col.sonar_column].type
            break

    for col in column_sequence:
        if col.text_value is not None:
            # constant text: embed once (memoized) and repeat for every row
            vectors, sentences = _get_embed_sentences(col.text_value)
            vectors_extended = pa.chunked_array(
                [pa.ListArray.from_arrays([0, len(vectors)], vectors)] * table_len
            )
            sentences_extended = pa.chunked_array(
                [pa.ListArray.from_arrays([0, len(sentences)], sentences)] * table_len
            )
        else:
            assert (col.text_column is not None) and (col.sonar_column is not None)
            vectors_extended = table[col.sonar_column]
            sentences_extended = table[col.text_column]
            if is_list_like(vectors_extended):
                assert is_list_like(sentences_extended)
            else:
                # scalar columns: wrap each value into a length-1 list
                vectors_extended = simple_array_to_nested(vectors_extended)
                sentences_extended = simple_array_to_nested(sentences_extended)

            if target_dtype and vectors_extended.type != target_dtype:
                vectors_extended = vectors_extended.cast(target_dtype)

        vectors_seq.append(vectors_extended)
        sentences_seq.append(sentences_extended)

    # row-wise concatenation of all parts
    new_vectors_array = hstack_pyarray_list(*vectors_seq)
    new_sentences_array = hstack_pyarray_list(*sentences_seq)
    del vectors_seq, sentences_seq
    table = table.append_column(vector_name, new_vectors_array)
    table = table.append_column(text_name, new_sentences_array)
    return table
669
+
670
+
671
@njit
def _get_hierarchical_indices_and_offsets(
    pagaraphs_lengths: List[np.ndarray], max_seq_len: int
):
    # Greedily pack each row's consecutive paragraph lengths into chunks whose
    # total element count stays <= max_seq_len.
    # Returns:
    #   indices            - source-row index of each output chunk
    #   new_lens           - cumulative element offsets delimiting the chunks
    #   hierarchy_new_lens - cumulative paragraph counts delimiting the chunks
    # NOTE(review): if a row's first paragraph alone exceeds max_seq_len, an
    # empty leading chunk is emitted (flush happens with tmp_lens_sum == 0) —
    # confirm upstream guards against over-long paragraphs.
    indices = []
    new_lens = [0]
    hierarchy_new_lens = [0]

    for i, current_lens in enumerate(pagaraphs_lengths):
        tmp_lens_sum = 0  # elements accumulated in the open chunk
        nb_blocks = 0  # paragraphs accumulated in the open chunk
        for ll in current_lens:
            if ll + tmp_lens_sum > max_seq_len:
                # flush the open chunk, start a new one with this paragraph
                indices.append(i)
                new_lens.append(new_lens[-1] + tmp_lens_sum)
                hierarchy_new_lens.append(hierarchy_new_lens[-1] + nb_blocks)

                tmp_lens_sum = ll
                nb_blocks = 0
            else:
                tmp_lens_sum += ll

            nb_blocks += 1

        if nb_blocks > 0:
            # flush the trailing chunk of this row
            indices.append(i)
            new_lens.append(new_lens[-1] + tmp_lens_sum)
            hierarchy_new_lens.append(hierarchy_new_lens[-1] + nb_blocks)

    return (
        np.array(indices, dtype=np.int32),
        np.array(new_lens, dtype=np.int32),
        np.array(hierarchy_new_lens, dtype=np.int32),
    )
705
+
706
+
707
def hierarchical_explode_table_with_max_length(
    table: pa.Table,
    columns: Union[str, List[str]],
    max_seq_len: int,
    page_len_column: str,
    page_embs_columns: Optional[Union[str, List[str]]],
) -> pa.Table:
    """Explode rows into chunks of at most `max_seq_len` elements, cutting
    only at paragraph boundaries (given by `page_len_column`).

    :param table: input batch.
    :param columns: parallel list columns to re-chunk element-wise.
    :param max_seq_len: maximum number of elements per output row.
    :param page_len_column: list column of per-paragraph element counts.
    :param page_embs_columns: paragraph-level columns re-chunked at
        paragraph granularity (same offsets as `page_len_column`).
    """
    if isinstance(columns, str):
        columns = [columns]

    if isinstance(page_embs_columns, str):
        page_embs_columns = [page_embs_columns]
    elif page_embs_columns is None:
        page_embs_columns = []

    assert len(columns) > 0

    cols = [pc.fill_null(table[columns[0]], [None])]
    lengths = pc.list_value_length(cols[0]).to_numpy()

    for name in columns[1:]:
        col = pc.fill_null(table[name], [None])
        # checking that all columns list structures are parallel
        assert (lengths == pc.list_value_length(col).to_numpy()).all()
        cols.append(col)

    pagaraphs_lengths = table[page_len_column].to_pandas().to_list()
    # assert [x.sum() for x in pagaraphs_lengths] == lengths.tolist()
    # next unroll with max_seq_len
    indices, new_offests, hierarchy_offsets = _get_hierarchical_indices_and_offsets(
        pagaraphs_lengths, max_seq_len
    )

    # columns that are neither re-chunked element-wise nor paragraph-wise are
    # simply duplicated per output chunk
    other_columns = list(table.schema.names)
    for name in set(columns + [page_len_column] + page_embs_columns):
        other_columns.remove(name)

    remaining_table = table.select(other_columns).take(indices)

    result_dict = {}
    for name in other_columns:
        result_dict[name] = remaining_table[name]

    # element-level columns: re-slice flattened values with the new offsets
    for name, col in zip(columns, cols):
        rolled_array = pa.ListArray.from_arrays(
            offsets=new_offests,
            values=pyarrow_column_to_array(pc.list_flatten(col)),
        )
        result_dict[name] = rolled_array

    # paragraph-level columns: re-slice with the paragraph-count offsets
    for name in set([page_len_column] + page_embs_columns):
        col = table[name]
        rolled_array = pa.ListArray.from_arrays(
            offsets=hierarchy_offsets,
            values=pyarrow_column_to_array(pc.list_flatten(col)),
        )
        result_dict[name] = rolled_array

    return pa.Table.from_pydict(result_dict, schema=table.schema)
766
+
767
+
768
def filter_table_with_different_lengths(
    table: pa.Table, columns: List[str]
) -> pa.Table:
    """Drop rows where the list columns in `columns` do not all have the same
    per-row lengths (e.g. number of sentences vs. number of sonar vectors).

    Fixes: the reference lengths were not recomputed after an intermediate
    `table.filter(...)`, so a second mismatching column compared arrays of
    different sizes; also replaces the deprecated `logger.warn` and restores
    the missing space in the log message.
    """
    if len(columns) <= 1 or not all(is_list_like(table[col]) for col in columns):
        return table

    ref_lengths = pc.list_value_length(table[columns[0]])
    for col in columns[1:]:
        same_lens = pc.equal(pc.list_value_length(table[col]), ref_lengths)
        if pc.all(same_lens).as_py():
            continue
        logger.warning(
            "filtering table whose nb sentences and nb sonar vectors are aligned, "
            "keeping %s rows out of %s",
            pc.sum(same_lens).as_py(),
            len(table),
        )
        table = table.filter(same_lens)
        # reference lengths must track the filtered table for the next columns
        ref_lengths = pc.list_value_length(table[columns[0]])
    return table
785
+
786
+
787
@dataclass
class PFSState:
    """Resumable read-position state for `ParquetFragmentStreamer`."""

    # number of files whose fragments have all been yielded
    nb_fully_read_files: int = 0
    # number of fragments already yielded from the file currently being read
    nb_current_file_read_fragements: int = 0
    # running totals across the whole stream, used to enforce limits
    total_nb_fragments: int = 0
    total_nb_rows: int = 0
794
+
795
class ParquetFragmentStreamer:
    """Lazily iterate over a dataset's fragments (optionally split to row
    groups), enforcing `ParquetDatasetLimitOptions` and keeping a resumable
    `PFSState` so iteration can be pickled and restarted mid-stream."""

    def __init__(
        self,
        parquet_ds: pq.ParquetDataset,
        split_to_row_groups: bool = True,
        limit_options: Optional[ParquetDatasetLimitOptions] = None,
        read_state: Optional[PFSState] = None,
    ):
        self.split_to_row_groups = split_to_row_groups
        self.limit_options = limit_options or ParquetDatasetLimitOptions()
        self.parquet_ds = parquet_ds

        if read_state is not None:
            # resume from a previously saved position
            self.state = read_state
        else:
            self.reset_state()

    def reset_state(self):
        # restart iteration from the beginning of the dataset
        self.state = PFSState()

    def __reduce__(self):
        # pickling preserves the current read position via `self.state`
        return (
            self.__class__,
            (
                self.parquet_ds,
                self.split_to_row_groups,
                self.limit_options,
                self.state,
            ),
        )

    def truncate_files(
        self,
        parquet_ds: pq.ParquetDataset,
        fraction_of_files: Optional[float],
        nb_files: Optional[int],
    ) -> List[pa.dataset.Fragment]:
        """Apply the file-level limits and return the retained fragments."""
        file_ds_fragments = get_dataset_fragments(
            parquet_ds, parquet_ds._filter_expression
        )
        # dataset root (strip the partition suffix), used for log messages
        self.proxy_ds_path = "/".join(parquet_ds.files[0].split("=")[0].split("/")[:-1])
        logger.info(
            f"{self.proxy_ds_path} : full number of files {len(file_ds_fragments)}"
        )

        if fraction_of_files is not None:
            # keep at least one file
            file_ds_fragments = file_ds_fragments[
                : max(
                    int(round(fraction_of_files * len(file_ds_fragments))),
                    1,
                )
            ]
            logger.info(
                f"{self.proxy_ds_path} : reducing number of files to {len(file_ds_fragments)} because of fraction_of_files={fraction_of_files}"
            )
        if nb_files is not None and nb_files < len(file_ds_fragments):
            file_ds_fragments = file_ds_fragments[:nb_files]
            logger.info(
                f"{self.proxy_ds_path} : reducing number of files to {len(file_ds_fragments)} because of nb_files={nb_files}"
            )
        return file_ds_fragments

    def __iter__(self):
        limit_options = self.limit_options

        file_ds_fragments = self.truncate_files(
            self.parquet_ds,
            limit_options.fraction_of_files,
            limit_options.nb_files,
        )

        if not self.split_to_row_groups:
            # whole-file fragments; resume after the already-read files
            for frag in file_ds_fragments[
                self.state.nb_fully_read_files : limit_options.nb_fragments
            ]:
                self.state.nb_fully_read_files += 1
                yield frag

                if limit_options.nb_rows is not None:
                    self.state.total_nb_rows += frag.count_rows()
                    if self.state.total_nb_rows >= limit_options.nb_rows:
                        break
        else:
            early_stop = False
            logger.info(f"{self.proxy_ds_path} : starting split in row groups")

            for new_file in file_ds_fragments[self.state.nb_fully_read_files :]:
                new_file_fragments = split_fragment_in_row_groups(new_file)
                # skip row groups already yielded from this file on resume
                new_file_fragments = new_file_fragments[
                    self.state.nb_current_file_read_fragements :
                ]
                if limit_options.nb_rows is not None:
                    # NOTE(review): uses row_groups[0].num_rows per fragment,
                    # so the nb_rows limit is approximate — confirm intended
                    new_file_fragments_stats = [
                        frag.row_groups[0].num_rows for frag in new_file_fragments
                    ]
                else:
                    new_file_fragments_stats = [0] * len(new_file_fragments)

                for nb_row, frag in zip(new_file_fragments_stats, new_file_fragments):
                    self.state.total_nb_rows += nb_row
                    self.state.total_nb_fragments += 1
                    self.state.nb_current_file_read_fragements += (
                        1  # increment before yield
                    )
                    yield frag

                    if (
                        limit_options.nb_fragments is not None
                        and self.state.total_nb_fragments >= limit_options.nb_fragments
                    ):
                        early_stop = True
                        if limit_options.nb_rows is not None:
                            logger.info(
                                f"{self.proxy_ds_path} : nb_fragments limit {limit_options.nb_fragments} was reached with around {self.state.total_nb_rows} rows"
                            )
                        else:
                            logger.info(
                                f"{self.proxy_ds_path} : nb_fragments limit {limit_options.nb_fragments} was reached"
                            )
                        break
                    if (
                        limit_options.nb_rows is not None
                        and self.state.total_nb_rows >= limit_options.nb_rows
                    ):
                        early_stop = True
                        logger.info(
                            f"{self.proxy_ds_path} : nb_rows limit {limit_options.nb_rows} was reached with around {self.state.total_nb_fragments} fragments"
                        )
                        break
                if early_stop:
                    break
                # only when the full file is read do we increment this
                self.state.nb_fully_read_files += 1
                self.state.nb_current_file_read_fragements = 0
929
+
930
+
931
@dataclass
class ShuffledIteratorState:
    """Resumable state for `ShuffledIterator` (pickled via __reduce__)."""

    # number of completed passes over the base iterator
    epoch_count: int
    # shuffled window currently being consumed
    current_window: List[Any]
    # next position to read within `current_window`
    index: int
    # RNG driving the window shuffles
    random_state: np.random.RandomState
937
+
938
+
939
class ShuffledIterator(Iterator[Any]):
    """Window-shuffling wrapper over a resettable iterator.

    Reads `window_size` items at a time, shuffles each window with a seeded
    RNG, and replays the base iterator for `nb_epoch` passes. The base
    iterator must expose `reset_state()`. State is picklable for resumption.
    """

    def __init__(
        self,
        iterator,
        window_size: int,
        nb_epoch: int,
        seed: Optional[int],
        state: Optional[ShuffledIteratorState] = None,
    ):
        self.base_iterator = iterator
        self.window_size = window_size
        self.seed = seed
        self.nb_epoch = nb_epoch

        if state is None:
            state = ShuffledIteratorState(
                random_state=np.random.RandomState(self.seed),
                epoch_count=0,
                current_window=[],
                index=0,
            )
        self.state = state
        self.window_iterator = None

    def reset_state(self):
        # full restart: reseed the RNG and rewind the base iterator
        self.state.random_state = np.random.RandomState(self.seed)
        self.state.epoch_count = 0
        self._reset_inner()

    def __reduce__(self):
        # pickling captures the current window/epoch position via `self.state`
        return (
            self.__class__,
            (
                self.base_iterator,
                self.window_size,
                self.nb_epoch,
                self.seed,
                self.state,
            ),
        )

    def _reset_inner(self):
        # rewind the base iterator for the next epoch (keeps the RNG state)
        self.base_iterator.reset_state()
        self.state.index = 0
        self.state.current_window = []
        self.window_iterator = None

    def __iter__(self):
        return self

    def __next__(self) -> Any:
        if self.state.epoch_count >= self.nb_epoch:
            raise StopIteration

        # If current window is exhausted, fetch the next window
        if self.window_iterator is None:
            self.window_iterator = batched(self.base_iterator, self.window_size)  # type: ignore
        assert self.window_iterator is not None

        if self.state.index >= len(self.state.current_window):
            try:
                # Get the next window batch
                window = next(self.window_iterator)
                window = np.array(window, dtype="O")
                self.state.random_state.shuffle(window)
                self.state.current_window = window
                self.state.index = 0
            except StopIteration:
                # If no more batches, increment epoch count and reset iterator
                self.state.epoch_count += 1
                self._reset_inner()
                return self.__next__()

        # Return the next element from the current window
        result = self.state.current_window[self.state.index]
        self.state.index += 1
        return result
1016
+
1017
+
1018
def stream_parquet_fragments(
    parquet_ds: pq.ParquetDataset,
    nb_epochs: int,
    split_to_row_groups: bool = True,
    shuffle: bool = True,
    seed: Optional[int] = None,
    limit_options: Optional[ParquetDatasetLimitOptions] = None,
    shuffling_window: int = 200,
) -> DataPipelineBuilder:
    """Build a pipeline streaming dataset fragments for ``nb_epochs`` passes.

    When ``shuffle`` is enabled, fragments are shuffled within a sliding
    window of ``shuffling_window`` items (a window of 1 is equivalent to no
    shuffling).  Each yielded fragment is wrapped in a ``SafeFragment``.
    """
    streamer = ParquetFragmentStreamer(
        parquet_ds=parquet_ds,
        split_to_row_groups=split_to_row_groups,
        limit_options=limit_options,
    )

    def _restart(it):
        # read_iterator calls this to rewind the stream between uses.
        it.reset_state()
        return it

    shuffled = ShuffledIterator(
        streamer,
        window_size=shuffling_window if shuffle else 1,
        nb_epoch=nb_epochs,
        seed=seed,
    )
    builder = read_iterator(shuffled, _restart, infinite=False)

    return builder.map(SafeFragment)
1049
+
1050
+
1051
def get_row_group_level_metadata(
    dataset: pq.ParquetDataset,
    columns: Optional[List[str]] = None,
    nb_jobs: int = 40,
    max_fragments: int = -1,
    seed: int = 123,
) -> pd.DataFrame:
    """
    Parse row-group level metadata of a Parquet dataset into a pandas DataFrame.

    Similar to ``get_parquet_dataset_metadata`` but presents an unnested view
    on row-group statistics for a subset of columns, which is convenient for
    downstream analysis.  Fragments are processed in parallel (joblib threads)
    with a tqdm progress bar.

    Parameters:
    - dataset (pq.ParquetDataset): The Parquet dataset to parse.
    - columns (list of str, optional): Columns to include. If None, all columns
      are included; with ``columns=[]`` no column-wise information is produced
      (generally much faster).
    - nb_jobs (int, default=40): Number of parallel jobs.
    - max_fragments (int, default=-1): Maximum number of fragments to include
      (-1 means all), sampled deterministically with ``seed``.
    - seed (int, default=123): RNG seed used when subsampling fragments.

    Returns:
    - pd.DataFrame: one row per row group, with fragment path, partition keys,
      ``row_group_id``, ``num_rows``, ``total_byte_size`` and per-column stats.
    """
    assert max_fragments >= -1
    fragments = list(dataset._dataset.get_fragments(filter=dataset._filter_expression))

    # Optionally subsample the fragments deterministically.
    if max_fragments != -1 and max_fragments < len(fragments):
        rng = np.random.RandomState(seed)
        fragments = rng.choice(
            np.array(fragments, dtype="O"), max_fragments, replace=False
        ).tolist()

    physical_schema = fragments[0].physical_schema

    if columns is None:
        columns = physical_schema.names
    # Keep only columns that actually exist in the physical schema.
    missing_columns = tuple(set(columns) - set(physical_schema.names))
    if missing_columns:
        print(
            "Following colums are not present in physical schema and will be ignored",
            missing_columns,
        )
    columns = [col for col in columns if col in physical_schema.names]

    columns_index = [physical_schema.get_field_index(col) for col in columns]

    # These output keys are reserved; a requested column must not shadow them.
    reserved = set(["row_group_id", "num_rows", "total_byte_size"]) & set(columns)
    assert len(reserved) == 0, f"names conflict, rename/remove : {reserved}"

    def _row_group_info(row_group):
        meta = row_group.metadata
        info = {
            "row_group_id": row_group.id,
            "num_rows": meta.num_rows,
            "total_byte_size": meta.total_byte_size,
        }
        for col, ind in zip(columns, columns_index):
            info[col] = meta.column(ind).to_dict()
        return info

    def _fragment_info(frag):
        return {
            "rg_stats": [_row_group_info(rg) for rg in frag.row_groups],
            "parquet_file_path": frag.path,
            **get_partition_keys(frag.partition_expression),
        }

    stats = Parallel(nb_jobs, backend="threading")(
        delayed(_fragment_info)(frag) for frag in tqdm(fragments)
    )

    # Explode per-fragment row-group lists into one row per row group.
    frame = pd.DataFrame(stats).explode("rg_stats")
    per_rg = pd.DataFrame(frame.pop("rg_stats").tolist(), index=frame.index)
    return pd.concat([frame, per_rg], axis=1)
lcm/datasets/sentence_splitter_pipeline.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import gc
9
+ import typing as tp
10
+ from builtins import enumerate
11
+ from dataclasses import dataclass, field
12
+
13
+ import numba
14
+ import numpy as np
15
+ import polars as pl
16
+ import pyarrow as pa
17
+ import pyarrow.compute as pc
18
+ import torch
19
+ from stopes.modules.partitioned_data_mapper import BatchMapper
20
+ from stopes.modules.preprocess.sonar_text_embedding import (
21
+ SonarTextBatchEmbedder,
22
+ SonarTextEmbedderConfig,
23
+ )
24
+ from stopes.utils.arrow_utils import (
25
+ apply_on_nested_array,
26
+ )
27
+ from wtpsplit import SaT, indices_to_sentences
28
+
29
+ from lcm.datasets.sentence_splitting import remove_emojis, resplit
30
+
31
+
32
@numba.jit(nopython=True)
def insert_elements(arr, max_diff):
    """
    Densify a sorted integer array so that no two consecutive elements differ
    by more than ``max_diff``.

    Parameters:
        arr (numpy array): The original (sorted) array of integers.
        max_diff (int): Maximum allowed difference between consecutive
            elements after insertion.

    Returns:
        numpy array: int32 array with extra points inserted, evenly spread
        across each oversized gap.
    """
    out = []
    for i in range(len(arr) - 1):
        out.append(arr[i])
        gap = arr[i + 1] - arr[i]
        if gap > max_diff:
            # Spread the inserted points evenly across the gap.
            n_extra = int(gap // max_diff)
            step = gap / (n_extra + 1)
            prev = arr[i]
            for _ in range(1, n_extra + 1):
                candidate = round(prev + step)
                if candidate < arr[i + 1]:
                    out.append(candidate)
                    prev = candidate
    out.append(arr[-1])
    return np.array(out, dtype=np.int32)
60
+
61
+
62
@numba.jit(nopython=True)
def merge_small_intervals(
    lenghts: np.ndarray, min_merging_length: int = 2, max_merge_length: int = 15
):
    """
    Merge runs of small intervals in a list of lengths.

    Intervals of at most ``min_merging_length`` are accumulated into a larger
    merged interval, capped at ``max_merge_length``.

    Parameters:
        lenghts (np.ndarray): lengths to be merged.
        min_merging_length (int): maximum length of an interval eligible for
            merging. Defaults to 2.
        max_merge_length (int): maximum length of a merged interval.
            Defaults to 15.
    Returns:
        np.ndarray: int32 array of merged lengths.

    Examples:
        >>> merge_small_intervals(np.array([1, 2, 3, 4, 5]))
        array([3, 3, 4, 5], dtype=int32)
        >>> merge_small_intervals(np.array([1, 1, 1, 1, 1]))
        array([5], dtype=int32)
        >>> merge_small_intervals(np.array([1, 2, 3, 2, 2, 2, 4, 1, 1, 5]))
        array([3, 3, 6, 4, 2, 5], dtype=int32)
    """
    merged = []
    pending = 0

    for curr_len in lenghts:
        if curr_len <= min_merging_length and pending + curr_len <= max_merge_length:
            # Accumulate this short interval into the pending merge.
            pending += curr_len
        else:
            # Flush the pending merge (if any) before the long interval.
            if pending > 0:
                merged.append(pending)
                pending = 0
            merged.append(curr_len)
    if pending > 0:
        merged.append(pending)

    return np.array(merged, dtype=np.int32)
100
+
101
+
102
@numba.jit(nopython=True)
def find_closest_indices(arr1, arr2):
    """
    Find indices of the closest elements in arr2 for each element in arr1.

    Parameters:
        arr1 (numpy array): elements for which to find the closest match.
        arr2 (numpy array): sorted array searched for the closest elements.

    Returns:
        indices (numpy array): index into arr2 of the closest element for
        each element of arr1 (ties resolved towards the smaller index).
    """
    # searchsorted returns positions in [0, len(arr2)]; a value greater than
    # every element of arr2 yields len(arr2), which would make arr2[indices]
    # go out of bounds. Clipping to the last valid index fixes that while
    # keeping in-range results unchanged (the last element IS the closest
    # one for such values).
    indices = np.searchsorted(arr2, arr1, side="left")
    indices = np.clip(indices, a_min=0, a_max=len(arr2) - 1)

    indices_bis = np.clip(indices - 1, a_min=0, a_max=len(arr2) - 1)
    dist_one = np.abs(arr2[indices] - arr1)
    dist_bis = np.abs(arr2[indices_bis] - arr1)

    return np.where(dist_one < dist_bis, indices, indices_bis)
122
+
123
+
124
@dataclass
class SentenceSplitterConfig:
    """Configuration for the SaT-based ``SentenceSplitter``."""

    columns: tp.List[str]  # input text columns to split
    model_name: str = "sat-6l"  # wtpsplit SaT checkpoint name
    sentence_suffix: str = "_sentences"  # suffix appended to output column names
    sentence_threshold: float = 0.01  # SaT split-probability threshold
    max_sentence_len: int = 256  # sentences longer than this (chars) get re-split
    min_text_length: int = 10  # texts at most this long are kept unsplit
    min_unique_chars: int = 0  # drop sentences with <= this many unique chars (0 = off)
    fallback_separators: tp.List[str] = field(
        default_factory=lambda: [
            "...",
            "\n",
            "!",
            "?",
            ";",
            ":",
            ".",
            ",",
            "\t",
            " ",
        ]
    )
    device: str = "cuda"
    remove_whitespace_before_inference: bool = False
    batch_size: int = 256
    block_size: int = 256
    stride: int = 256
    outer_batch_size: int = 1024
    verbose: bool = False
    pad_last_batch: bool = False
155
+
156
+
157
class SentenceSplitter(BatchMapper):
    """Split text columns of a pyarrow Table into lists of sentences.

    Uses a wtpsplit SaT model for the initial split, then re-splits any
    sentence longer than ``max_sentence_len`` — first with model
    probabilities, then with hard fallback separators.
    """

    def __init__(self, config: SentenceSplitterConfig):
        # NOTE(review): self.config is read below but only assigned by the
        # BatchMapper base class in super().__init__ — confirm upstream.
        super().__init__(config)
        self.columns = config.columns
        device = torch.device(config.device if torch.cuda.is_available() else "cpu")

        try:
            # Prefer a locally cached checkpoint; fall back to downloading.
            self.model = SaT(
                self.config.model_name,
                from_pretrained_kwargs={"local_files_only": True},
            )
        except Exception:
            self.model = SaT(self.config.model_name)

        if "cuda" in config.device:
            self.model.half()

        self.model.eval().to(device)

    @torch.inference_mode()
    def _resplit_long_sentences(self, col: pa.Array) -> pa.Array:
        """Re-split sentences >= max_sentence_len chars; wrap others as [text]."""
        mask = pc.greater_equal(pc.utf8_length(col), self.config.max_sentence_len)
        texts_to_resplit = col.filter(mask).to_pandas().to_list()

        resplit_sentences = []
        for text, probs in zip(
            texts_to_resplit,
            self.model.predict_proba(
                texts_to_resplit,
                stride=self.config.stride,
                block_size=self.config.block_size,
                batch_size=self.config.batch_size,
                pad_last_batch=self.config.pad_last_batch,
                remove_whitespace_before_inference=self.config.remove_whitespace_before_inference,
                outer_batch_size=self.config.outer_batch_size,
                verbose=self.config.verbose,
            ),
        ):
            # Pick a per-text threshold yielding roughly
            # len(text) / max_sentence_len split points.
            nb_split = round(len(probs) / self.config.max_sentence_len) + 1
            sentence_threshold = np.partition(probs, -nb_split)[-nb_split]
            sentences = indices_to_sentences(
                text,
                np.where(probs >= sentence_threshold)[0],
                strip_whitespace=False,
            )
            resplit_sentences.append(sentences)

        # Hard fallback: force-split anything still too long on separators.
        def _resplit(raw_sentences):
            for separator in self.config.fallback_separators:
                raw_sentences = [
                    subchunk.strip()
                    for sent in raw_sentences
                    for subchunk in resplit(
                        sent, max_length=self.config.max_sentence_len, sep=separator
                    )
                ]
            return raw_sentences

        np_mask = mask.to_pandas().to_numpy()
        full_text = col.to_pandas().to_list()

        # Re-interleave re-split texts with the untouched (short) ones.
        output_sentences = []
        j = 0
        for i, text in enumerate(full_text):
            if np_mask[i]:
                output_sentences.append(_resplit(resplit_sentences[j]))
                j += 1
            else:
                output_sentences.append([text])

        return pa.array(output_sentences, type=pa.list_(pa.string()))

    def resplit_long_sentences(self, col: pa.Array) -> pa.Array:
        """Apply _resplit_long_sentences within nested lists, then flatten back."""
        list_col = apply_on_nested_array(self._resplit_long_sentences, col)
        reflatten_col = pl.from_arrow(list_col).list.eval(pl.element().explode())  # type: ignore
        # Optionally drop near-degenerate sentences (e.g. one char repeated).
        if self.config.min_unique_chars > 0:
            reflatten_col = reflatten_col.list.eval(
                pl.when(
                    pl.element().str.split("").list.n_unique()
                    > self.config.min_unique_chars
                )
                .then(pl.element())
                .drop_nulls()
            )
        return reflatten_col.to_arrow().cast(pa.list_(pa.string()))

    @torch.inference_mode()
    def basic_split_on_single_column(
        self,
        col: tp.Union[pa.Array, pa.ChunkedArray],
    ) -> tp.Union[pa.Array, pa.ChunkedArray]:
        """Split each string of *col* into a list of sentences with SaT."""
        if not (pa.types.is_large_string(col.type) or pa.types.is_string(col.type)):
            raise ValueError("Column must be of type string")

        texts = col.to_pandas().to_list()
        texts = list(map(remove_emojis, texts))

        # Only run the model on texts long enough to be worth splitting.
        long_texts = [t for t in texts if len(t) > self.config.min_text_length]
        keep_texts = [
            (idx, t)
            for idx, t in enumerate(texts)
            if len(t) <= self.config.min_text_length
        ]

        outputs = self.model.split(
            long_texts,
            threshold=self.config.sentence_threshold,
            stride=self.config.stride,
            block_size=self.config.block_size,
            batch_size=self.config.batch_size,
            pad_last_batch=self.config.pad_last_batch,
            remove_whitespace_before_inference=self.config.remove_whitespace_before_inference,
            outer_batch_size=self.config.outer_batch_size,
            verbose=self.config.verbose,
        )
        sentences = []
        for row in outputs:
            sentences.append([s.strip() for s in row if s.strip()])

        # Re-insert the short texts at their original positions.
        # BUGFIX: wrap the text in a list — every element of `sentences` must
        # be a list[str] to match pa.list_(pa.string()) below; the original
        # inserted a bare str.
        for idx, text in keep_texts:
            sentences.insert(idx, [text])

        return pa.array(sentences, type=pa.list_(pa.string()))

    def __call__(self, table: pa.Table) -> pa.Table:
        """Append a ``<col><sentence_suffix>`` list column for each configured column."""
        for column in self.columns:
            sentence_array = self.basic_split_on_single_column(table[column])

            sentence_array = self.resplit_long_sentences(sentence_array)

            table = table.append_column(
                f"{column}{self.config.sentence_suffix}", sentence_array
            )

        return table
294
+
295
+
296
@dataclass
class FullPipelineConfig:
    """Configuration for ``FullPipeline``: split sentences, then embed them."""

    splitter_config: SentenceSplitterConfig  # sentence-splitting stage
    sonar_encoder_config: SonarTextEmbedderConfig  # SONAR embedding stage
    min_text_length: int = 10  # rows with shorter text are filtered out first
301
+
302
+
303
class FullPipeline(BatchMapper):
    """
    Create SONAR vectors from raw text in one pass:
    split each text column into sentences, then compute SONAR embeddings.

    Requires only one input column (e.g. `text`), and the text must not be
    empty.

    Example of config:

        splitter_config = SentenceSplitterConfig(
            columns=["text"],
            model_name="sat-3l",
            verbose=True,
            sentence_threshold=0.02,
            max_sentence_len=256,
        )
        sonar_encoder_config = SonarTextEmbedderConfig(
            column_config=[LangColumnConfig("text_sentences", lang_value="eng_Latn")],
            device="cuda",
        )

        full_config = FullPipelineConfig(
            splitter_config=splitter_config,
            sonar_encoder_config=sonar_encoder_config,
        )
    """

    def __init__(self, config: FullPipelineConfig):
        self.config = config
        self.splitter = SentenceSplitter(self.config.splitter_config)
        self.sonar_encoder = SonarTextBatchEmbedder(self.config.sonar_encoder_config)

    def __call__(self, batch: pa.Table) -> pa.Table:
        # Drop rows whose raw text is too short to be worth splitting.
        for col in self.config.splitter_config.columns:
            keep = pc.greater_equal(
                pc.utf8_length(batch[col]), self.config.min_text_length
            )
            batch = batch.filter(keep)

        batch = self.splitter(batch)
        batch = self.sonar_encoder(batch)
        # Free intermediate buffers from the heavy split/embed stages.
        gc.collect()
        return batch
lcm/datasets/sentence_splitting.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+
7
+ import codecs
8
+ import re
9
+ import typing as tp
10
+ from functools import lru_cache
11
+
12
+ import spacy
13
+ import torch
14
+ from sacremoses import MosesDetokenizer, MosesPunctNormalizer
15
+ from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
16
+ from stopes.utils.language_codes import language_code_to_short_code
17
+
18
+
19
def remove_emojis(text: str) -> str:
    """Strip emoji and pictographic symbols from *text*."""
    # NOTE(review): the U+24C2-U+1F251 range is very broad and also covers
    # some non-emoji symbols — kept as-is to preserve behavior.
    pattern = re.compile(
        "["
        "\U0001f600-\U0001f64f"  # emoticons
        "\U0001f300-\U0001f5ff"  # symbols & pictographs
        "\U0001f680-\U0001f6ff"  # transport & map symbols
        "\U0001f1e0-\U0001f1ff"  # flags (iOS)
        "\U00002702-\U000027b0"
        "\U000024c2-\U0001f251"
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U0001f700-\U0001f77f"  # Alchemical Symbols
        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
        "\U0001fa00-\U0001fa6f"  # Chess Symbols
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\U0001f6c0-\U0001f6cf"  # Miscellaneous Symbols and Pictographs (part)
        "\U0001f6d0-\U0001f6d5"  # Miscellaneous Symbols and Pictographs (part)
        "\U0001f6f0-\U0001f6fa"  # Miscellaneous Symbols and Pictographs (part)
        "]+",
        flags=re.UNICODE,
    )
    return pattern.sub("", text)
41
+
42
+
43
def batched(inputs: tp.Iterable, batch_size=10000) -> tp.Iterable:
    """Yield successive lists of at most *batch_size* items from *inputs*.

    Fixes the original implementation, which unconditionally yielded the
    final buffer and therefore emitted a spurious empty list whenever
    ``len(inputs)`` was a multiple of ``batch_size`` (or inputs was empty).
    """
    batch = []
    for item in inputs:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Only yield a non-empty remainder.
    if batch:
        yield batch
51
+
52
+
53
def filter_empty_string(text):
    """Return True when *text* contains no alphanumeric character at all."""
    return all(not ch.isalnum() for ch in text)
55
+
56
+
57
def remove_non_printable_chars(string):
    """Drop every character outside the printable ASCII range 0x20-0x7E."""
    return "".join(ch for ch in string if "\x20" <= ch <= "\x7e")
59
+
60
+
61
def deescape_special_chars(string):
    """Interpret backslash escape sequences (e.g. ``"\\n"``) in *string*."""
    decoded = codecs.decode(string, "unicode_escape")
    return decoded
63
+
64
+
65
def resplit(text: str, max_length: int, sep: str) -> tp.List[str]:
    """Greedily split *text* on *sep* into chunks of at most *max_length* chars.

    The separator is kept at the end of every fragment except the last one.
    A single fragment longer than ``max_length`` is emitted as-is.
    """
    tokens = text.split(sep)
    chunks: tp.List[str] = []
    buffer = ""

    # Every token but the last carries its separator.
    for token in tokens[:-1]:
        piece = token + sep
        if len(buffer) + len(piece) <= max_length:
            buffer += piece
        else:
            if buffer:
                chunks.append(buffer)
            buffer = piece

    # The final token has no trailing separator.
    tail = tokens[-1]
    if len(buffer) + len(tail) <= max_length:
        buffer += tail
    else:
        if buffer:
            chunks.append(buffer)
        buffer = tail

    if buffer:
        chunks.append(buffer)

    return chunks
93
+
94
+
95
@lru_cache
def get_moses_normalizers(lang):
    """Return a cached ``(punct_normalizer, detokenizer)`` pair for *lang*."""
    short_lang = language_code_to_short_code(lang, try_replacing_with_macro=True)
    normalizer = MosesPunctNormalizer(lang=short_lang)
    # Pre-compile the substitution regexes once; the pair is cached per lang.
    normalizer.substitutions = [
        (re.compile(r), sub) for r, sub in normalizer.substitutions
    ]
    return normalizer, MosesDetokenizer(lang=short_lang)
102
+
103
+
104
@lru_cache
def get_splitter(lang: str, model_name: tp.Optional[str] = None):
    """Return a cached sentence-splitting callable for *lang*.

    Tries a spaCy sentencizer pipeline first (on GPU when available) and
    falls back to the stopes splitter when the spaCy model cannot be loaded.

    :param lang: language code (e.g. ``eng_Latn``).
    :param model_name: explicit spaCy model name; derived from the language
        when None (fixes the original ``str = None`` annotation).
    """
    moses_lang = language_code_to_short_code(lang, try_replacing_with_macro=True)
    if model_name is None:
        model_name = (
            f"{moses_lang}_core_web_sm"
            if moses_lang == "en"
            else f"{moses_lang}_core_news_sm"
        )
    try:
        if torch.cuda.is_available():
            spacy.require_gpu()
        spacy_nlp = spacy.load(model_name, enable=["sentencizer"])
        spacy_nlp.add_pipe("sentencizer")

        def spacy_splitter(text):
            # Feed the text in chunks below spaCy's max input length.
            for batch in batched(text, batch_size=999_000):
                for sent in spacy_nlp("".join(batch)).sents:
                    yield str(sent)

        return spacy_splitter
    except (ModuleNotFoundError, OSError):
        # BUGFIX: spacy.load raises OSError when the model package is not
        # installed; the original only caught ModuleNotFoundError, so the
        # fallback below was unreachable in the common failure case.
        print(
            f"Spacy splitter not found for {lang}, switching to stopes implementation"
        )
        return get_split_algo(lang[:3], "default")
130
+
131
+
132
class ResplitSentenceSplitter:
    """Language-aware sentence splitter that re-chunks long sentences.

    Splits a document with the best available splitter for the language,
    then force-splits any sentence longer than ``max_length`` by trying
    the fallback separators in order, and finally Moses-normalizes the
    surviving sentences.
    """

    def __init__(
        self,
        fallback_separators=(".", "!", "?", "...", "\n", ";", ",", ":", ">", " "),
    ):
        self.fallback_separators = fallback_separators

    def __call__(
        self, document: str, lang: str = "eng_Latn", max_length: int = 200
    ) -> tp.List[str]:
        normalizer, detokenizer = get_moses_normalizers(lang)
        # XXX: two below are not various language friendly
        # document = deescape_special_chars(document)
        # document = remove_non_printable_chars(document)
        document = remove_emojis(document)

        sentences = get_splitter(lang)(document)
        # Re-chunk long sentences, trying each separator in order.
        for separator in self.fallback_separators or []:
            sentences = [
                piece.strip()
                for sent in sentences
                for piece in resplit(sent, max_length=max_length, sep=separator)
            ]

        return [
            normalizer.normalize(detokenizer.detokenize(sent.strip().split()))
            for sent in sentences
            if len(sent) > 1 and not filter_empty_string(sent)
        ]
lcm/datasets/utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+
7
+ import torch
8
+ from fairseq2.models.sequence import SequenceBatch
9
+
10
+
11
def move_eos_to_the_end(
    batch: SequenceBatch, pad_token_id: int = 0, eos_token_id: int = 3
) -> SequenceBatch:
    """
    Convert a decoder-input batch (EOS prepended) into a decoder-output batch
    (EOS appended) of the same shape.

    Known caveats (unchanged from the original implementation):
    1) If the sequence end has been truncated away, EOS is appended anyway.
    2) The language-code token remains included in the loss computation.
    """
    # Drop the leading EOS and pad on the right to keep the original length.
    shifted = torch.cat(
        [
            batch.seqs[:, 1:],
            torch.zeros_like(batch.seqs[:, :1]) + pad_token_id,
        ],
        dim=-1,
    )
    # Write EOS over the last real (non-padding) token of every row.
    if batch.padding_mask:
        rows = torch.arange(shifted.shape[0], dtype=torch.int32)
        shifted[rows, batch.padding_mask.seq_lens - 1] = eos_token_id
    else:
        # No padding: every row is full, EOS goes in the final position.
        shifted[:, -1] = eos_token_id

    return SequenceBatch(
        seqs=shifted,
        padding_mask=batch.padding_mask,
    )
lcm/models/two_tower_diffusion_lcm/loader.py CHANGED
@@ -6,6 +6,7 @@
6
 
7
  from fairseq2.models.config_loader import StandardModelConfigLoader
8
  from fairseq2.models.loader import StandardModelLoader, load_model
 
9
 
10
  from lcm.models.base_lcm.loader import convert_lcm_checkpoint
11
  from lcm.models.two_tower_diffusion_lcm.builder import (
@@ -23,11 +24,12 @@ load_two_tower_diffusion_lcm_config = StandardModelConfigLoader(
23
  )
24
 
25
 
26
- load_two_tower_diffusion_lcm_model = StandardModelLoader( # type: ignore # FIXME
27
  config_loader=load_two_tower_diffusion_lcm_config,
28
  factory=create_two_tower_diffusion_lcm_model,
29
  checkpoint_converter=convert_lcm_checkpoint,
30
  restrict_checkpoints=False,
 
31
  )
32
 
33
  load_model.register(
 
6
 
7
  from fairseq2.models.config_loader import StandardModelConfigLoader
8
  from fairseq2.models.loader import StandardModelLoader, load_model
9
+ from Patches import Patch_TorchLoader
10
 
11
  from lcm.models.base_lcm.loader import convert_lcm_checkpoint
12
  from lcm.models.two_tower_diffusion_lcm.builder import (
 
24
  )
25
 
26
 
27
+ load_two_tower_diffusion_lcm_model = StandardModelLoader(
28
  config_loader=load_two_tower_diffusion_lcm_config,
29
  factory=create_two_tower_diffusion_lcm_model,
30
  checkpoint_converter=convert_lcm_checkpoint,
31
  restrict_checkpoints=False,
32
+ tensor_loader=Patch_TorchLoader.load_tensors, # 🔥 the key patch
33
  )
34
 
35
  load_model.register(
lcm/train/__main__.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+
7
+ import asyncio
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any, Optional
11
+
12
+ import hydra
13
+ import submitit
14
+ from omegaconf import DictConfig, OmegaConf
15
+ from omegaconf.omegaconf import open_dict, read_write
16
+ from stopes.core import Requirements, StopesModule
17
+
18
+ from lcm.train.common import get_trainer
19
+ from lcm.utils.common import setup_conf
20
+
21
+ setup_conf()
22
+
23
+
24
class TrainModule(StopesModule):
    """Stopes wrapper that launches an LCM trainer as a schedulable job."""

    def requirements(self) -> Requirements:
        return self.config.requirements

    def run(self, iteration_value: Optional[Any] = None, iteration_index: int = 0):
        # Scope the log folder to this job's name.
        with read_write(self.config):
            self.config.log_folder = Path(self.config.log_folder) / self.name()

        trainer = get_trainer(self.config)

        # Trainers are expected to expose a run() entry point.
        trainer.run()

    def should_retry(
        self,
        ex: Exception,
        attempt: int,
        iteration_value: Optional[Any] = None,
        iteration_index: int = 0,
    ) -> bool:
        # Clean the environment before retrying so the fairseq2
        # ProcessGroupGang can be set up again without stale state.
        with submitit.helpers.clean_env():
            return "ValueError" not in str(ex)

    def name(self):
        """Job name: the configured experiment name, or class name + config hash."""
        return self.config.get(
            "experiment_name", f"{self.__class__.__name__}_{self.sha_key()[:10]}"
        )
59
+
60
+
61
@dataclass
class TrainingConfig:
    """Top-level config: a trainer config plus a stopes launcher config."""

    trainer: DictConfig  # hydra config of the trainer (see recipes/train)
    launcher: DictConfig  # stopes launcher configuration
    dry_run: bool = False  # only instantiate the trainer, don't train
66
+
67
+
68
async def run(config: TrainingConfig):
    """Dump the resolved config, then schedule (or dry-run) the train module."""
    # Persist the fully-resolved config next to the job outputs.
    dump_dir = Path(config.launcher.config_dump_dir)
    dump_dir.mkdir(parents=True, exist_ok=True)
    OmegaConf.resolve(config)  # type: ignore
    # XXX: do we want to promote datasets configs from thier names to the final params
    OmegaConf.save(
        config=config,
        f=str(dump_dir / "all_config.yaml"),
    )

    train_config = config.trainer

    # A debug cluster implies debug mode on the trainer itself.
    with open_dict(train_config):
        if config.launcher.cluster == "debug":
            train_config.debug = True
        train_config.log_folder = config.launcher.log_folder

    if getattr(config, "dry_run", False):
        # Instantiate only — lets the user validate the config quickly.
        trainer = get_trainer(train_config)
        print(f"Trainer: {trainer}")
        print(f"Train config: {getattr(trainer, 'config')}")

        return

    launcher = hydra.utils.instantiate(config.launcher)

    module = TrainModule(train_config)
    await launcher.schedule(module)
100
+
101
+
102
@hydra.main(
    version_base="1.2",
    config_path="../../recipes/train",
    config_name="defaults.yaml",
)
def main(config: TrainingConfig) -> None:
    """
    Launch a train module from the CLI.

    Example:

    ```sh
    python -m lcm.train +pretrain=mse
    ```

    Here `pretrain` is a folder under the `recipes` directory and `mse` is a
    yaml file with the trainer configuration. That yaml file must be in the
    `trainer` package (i.e. start with the `# @package trainer` hydra
    directive) and contain a `__trainer__` entry defining the constructor for
    the trainer.

    Use `-c job` to inspect the configuration without running anything;
    `dry_run=true` to build the trainer from the configuration without
    training; and `launcher.cluster=debug` to debug jobs locally.
    """
    asyncio.run(run(config))


if __name__ == "__main__":
    main()
lcm/train/common.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from inspect import signature
7
+ from typing import Any, Dict, Protocol, Union, runtime_checkable
8
+
9
+ import hydra
10
+ from omegaconf import DictConfig, OmegaConf, read_write
11
+
12
+ from lcm.utils.common import promote_config
13
+
14
# Key under which the trainer constructor is declared in hydra configs.
TRAINER_KEY = "_trainer_"


@runtime_checkable
class Trainer(Protocol):
    """Structural interface of an LCM trainer: anything exposing run()."""

    def run(self) -> Any: ...
22
+
23
+
24
def _parse_training_config(train_config: DictConfig):
    """Resolve the trainer callable and its typed config from an omegaconf node.

    The node must contain a ``_trainer_`` entry pointing at a callable taking
    a single ``config`` argument; that argument's annotation is used to
    promote the raw DictConfig into a typed config object.

    :returns: ``(trainer_callable, typed_config)``
    :raises ValueError: on any parsing/validation failure, chained to the
        original exception.
    """
    # The config is set to read-only within the stopes module __init__.
    assert TRAINER_KEY in train_config, (
        f"The trainer configuration is missing a {TRAINER_KEY} configuration, "
        "you need to specify a Callable to initialize your config."
    )
    trainer_cls_or_func = train_config.get(TRAINER_KEY)
    try:
        trainer_obj = hydra.utils.get_object(trainer_cls_or_func)
        sign = signature(trainer_obj)
        assert len(sign.parameters) == 1 and "config" in sign.parameters, (
            f'{trainer_cls_or_func} should take a single argument called "config"'
        )
        param_type = sign.parameters["config"].annotation

        OmegaConf.resolve(train_config)
        # The trainer key is meta-information, not part of the typed config.
        with read_write(train_config):
            del train_config._trainer_

        typed_config = promote_config(train_config, param_type)
        return trainer_obj, typed_config
    except Exception as ex:
        # BUGFIX: the original raised ValueError(msg, str(ex)) — a two-argument
        # exception whose str() is a tuple repr; use one formatted message.
        raise ValueError(
            f"Couldn't parse the train config: {train_config}: {ex}"
        ) from ex
51
+
52
+
53
def get_trainer(train_config: DictConfig) -> Trainer:
    """Instantiate the trainer declared by *train_config*.

    Delegates parsing/validation to ``_parse_training_config``.
    """
    factory, typed_config = _parse_training_config(train_config)
    return factory(typed_config)
56
+
57
+
58
+ def _is_missing(config: Union[DictConfig, Dict], attr: str) -> bool:
59
+ if isinstance(config, Dict):
60
+ return attr in config and config[attr]
61
+ if OmegaConf.is_missing(config, attr):
62
+ return True
63
+ if not hasattr(config, attr) or not getattr(config, attr):
64
+ return True
65
+ return False
lcm/train/criterion.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from abc import abstractmethod
7
+ from dataclasses import dataclass
8
+ from typing import Any, Callable, Dict, List, Literal
9
+
10
+ from fairseq2.logging import get_log_writer
11
+ from omegaconf import MISSING
12
+ from torch.distributed.fsdp.fully_sharded_data_parallel import (
13
+ FullyShardedDataParallel as FSDP,
14
+ )
15
+ from torch.nn import Module
16
+ from torch.nn.parallel import DistributedDataParallel as DDP
17
+
18
+ from lcm.train.metrics import LossTerm
19
+
20
+ logger = get_log_writer(__name__)
21
+
22
+
23
@dataclass
class CriterionConfig:
    """A dataclass for criterion parameters.

    Base config shared by all training criterions; subclasses add
    criterion-specific fields.
    """

    name: str = MISSING
    """Name of the criterion, a unique identifier used in the CriterionsFactory"""

    reduction: Literal["sum", "mean"] = "sum"
    """How to reduce the loss across samples"""
32
+
33
+
34
class Criterion:
    """An abstract class for training criterions.

    Subclasses implement ``__call__`` to run the model's forward pass on a
    batch and return a :class:`LossTerm`.
    """

    def __init__(
        self,
        config: CriterionConfig,
        model: Module,
    ):
        self.config = config

        self.model = model

        # Names of loss terms tracked during training; one metric bag is
        # created per entry. Subclasses populate this list.
        self.summands: List[str] = []
        """ A list of loss term names to track during training.
        This will create metric bags for each
        """

        self.reduction = config.reduction

    @property
    def throughput_metric_name(self) -> str:
        # Name of the metric used as the throughput denominator.
        return "num_target_elements"

    @property
    def base_model(self):
        """A pointer to the unwrapped model if training with FSDP/DDP"""
        if isinstance(self.model, (DDP, FSDP)):
            _model = self.model.module
        else:
            _model = self.model
        return _model

    @abstractmethod
    def __call__(self, batch) -> LossTerm:
        """
        Computes the loss given an input batch.
        The model's forward pass is performed here
        """
72
+
73
+
74
class CriterionsFactory:
    """Registry-based factory for LCM criterions.

    Criterion classes register themselves under a unique name with the
    :meth:`register` decorator and are later instantiated by name with
    :meth:`build_criterion`.
    """

    # Shared, class-level registry mapping criterion name -> class.
    registry: Dict[str, Any] = {}

    @classmethod
    def build_criterion(cls, name: str, **kwargs) -> Any:
        """Build the criterion of choice from within the trainer.

        :param name: registry key the criterion class was registered with.
        :param kwargs: forwarded verbatim to the criterion's constructor.
        :raises KeyError: if ``name`` was never registered.
        """

        criterion_class = cls.registry[name]

        criterion = criterion_class(**kwargs)

        return criterion

    @classmethod
    def register(cls, name: str) -> Callable:
        """Decorator for adding criterions to the registry.

        :param name: unique registry key; registering the same name twice
            is a programming error and raises an ``AssertionError``.
        """

        def inner_wrapper(wrapped_class: Criterion) -> Callable:
            # Fixed message grammar: "already register" -> "already registered".
            assert name not in cls.registry, (
                f"{name} is already registered as a criterion"
            )
            cls.registry[name] = wrapped_class
            return wrapped_class

        return inner_wrapper
lcm/train/lcm/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
lcm/train/lcm/criterion.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from abc import abstractmethod
7
+ from dataclasses import dataclass
8
+ from typing import Optional, Tuple
9
+
10
+ import torch
11
+ from fairseq2.logging import get_log_writer
12
+ from torch import Tensor
13
+
14
+ from lcm.datasets.batch import LCMInput, LCMStyle
15
+ from lcm.models.abstract_lcm import AbstractLCModel
16
+ from lcm.models.sonar_normalizer import SonarNormalizer
17
+ from lcm.train.criterion import Criterion, CriterionConfig
18
+ from lcm.train.metrics import LossTerm
19
+
20
+ logger = get_log_writer(__name__)
21
+
22
+
23
def compute_standard_mse(
    flattened_predictions: Tensor,
    flattened_target: Tensor,
    scales: Optional[Tensor] = None,
    normalizer: Optional[SonarNormalizer] = None,
) -> Tuple[Tensor, Tensor]:
    """
    Computes MSE loss between predictions and targets.
    Note that, unlike regular MSE with mean/sum reduction, we first sum across channels
    before later reducing in the criterion.

    Parameters:
        flattened_predictions (Tensor): The predictions in (N, C)
        flattened_target (Tensor): The targets in (N, C)
        scales (Optional[Tensor]): If not None, each channel will be weighted by the corresponding scale.
        normalizer (Optional[SonarNormalizer]): If a normalizer is provided,
            the predictions and targets will first be denormalized before computing the MSE loss

    Returns:
        mse (Tensor): the per-row MSE loss (summed over channels) with optional scaling
        plain_mse (Tensor): The MSE loss without any scaling (for logging)
    """
    # NOTE: the original docstring documented an `epsilon` parameter and an
    # RMSE square root that this function does not take or compute; removed.

    assert flattened_predictions.dim() == 2, (
        "Expecting two-dimensional predictions and targets. ",
        f"Found targets in {flattened_target.size()} and ",
        f"predictions in {flattened_predictions.size()}",
    )

    assert flattened_predictions.shape == flattened_target.shape, (
        "Expecting predictions and targets of the same shape ",
        f"Received predictions {flattened_predictions.shape} and targets {flattened_target.shape}",
    )

    if scales is not None:
        assert scales.dim() == 1, (
            "Expecting a uni-dimensional tensor of scales ",
            f"Found a tensor with dimension {scales.dim()}",
        )
        assert len(scales) == flattened_target.shape[-1], (
            "The provided scales should have the same size as the target channels. ",
            f"Found {len(scales)} expected {flattened_target.shape[-1]}",
        )

    if normalizer is not None:
        # Fixed message grammar: "has not method" -> "has no method".
        assert hasattr(normalizer, "denormalize"), (
            "The provided normalizer has no method `denormalize`"
        )
        flattened_predictions = normalizer.denormalize(flattened_predictions)
        flattened_target = normalizer.denormalize(flattened_target)

    # Per-element squared error, then summed over the channel dimension.
    full_mse = torch.nn.functional.mse_loss(
        flattened_predictions, flattened_target, reduction="none"
    )
    plain_mse = full_mse.sum(dim=-1)

    if scales is not None:
        # Weight each channel before summing; `plain_mse` stays unscaled.
        full_mse = full_mse * scales.unsqueeze(0)
        mse = full_mse.sum(dim=-1)
    else:
        mse = plain_mse
    return mse, plain_mse
86
+
87
+
88
@dataclass
class LCMCriterionConfig(CriterionConfig):
    """Base config for LCM criterions."""

    compute_rmse: bool = True
    """If `True` take the square-root of MSE.
    This is for now `True` by default for backward compatibility"""
93
+
94
+
95
class LCMCriterion(Criterion):
    """An abstract class for the LCM's criterions"""

    config: LCMCriterionConfig

    def __init__(
        self,
        config: LCMCriterionConfig,
        model: AbstractLCModel,
        style: LCMStyle = LCMStyle.UNSUPERVISED,
    ):
        super().__init__(config, model)

        self.style = style

        # Summands for log/tb recorders
        self.summands = ["mse_loss", "reconstruction_loss"]

        # Normalize inside the criterion only when the model was built with
        # a sonar normalizer (i.e. its config names one).
        self.normalize_in_criterion = (
            self.base_model.config.sonar_normalizer_name is not None
        )

    @property
    def sonar_normalizer(self) -> Optional[SonarNormalizer]:
        # The normalizer may live on the model itself or on its frontend;
        # fall back to None (with a warning) if neither attribute exists.
        if hasattr(self.base_model, "sonar_normalizer"):
            return self.base_model.sonar_normalizer

        elif hasattr(self.base_model, "frontend") and hasattr(
            self.base_model.frontend, "sonar_normalizer"
        ):
            return self.base_model.frontend.sonar_normalizer

        else:
            logger.warning(
                "Couldn't find the model's `sonar_normalizer`, defaulting to None"
            )
            return None

    @property
    def throughput_metric_name(self) -> str:
        # Throughput is measured in target elements, matching the base class.
        return "num_target_elements"

    @abstractmethod
    def __call__(self, batch: LCMInput) -> LossTerm:
        """
        Computes the loss given an input batch.
        The model's forward pass is performed here
        Input batch is LCMInput (see `lcm.datasets.batch`)
        """
lcm/train/lcm/trainer.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import Dict, List, Mapping, Optional, Union
8
+
9
+ from fairseq2.assets import AssetCard
10
+ from fairseq2.checkpoint import FileCheckpointManager
11
+ from fairseq2.gang import Gang
12
+ from fairseq2.logging import get_log_writer
13
+ from fairseq2.metrics import MetricRecorder
14
+ from fairseq2.optim import DynamicLossScaler
15
+ from fairseq2.optim.lr_scheduler import AbstractLRScheduler
16
+ from fairseq2.utils.profiler import Profiler, Stopwatch
17
+ from fairseq2.utils.rng import RngBag
18
+ from omegaconf import MISSING
19
+ from stopes.core import Requirements
20
+ from torch.nn import Module
21
+ from torch.optim import Optimizer
22
+ import torch
23
+
24
+ from lcm.datasets.configs import ParquetDatasetConfig
25
+ from lcm.datasets.dataloader import LCMDataLoader
26
+ from lcm.datasets.dataloading import ds_name
27
+ from lcm.models.abstract_lcm import AbstractLCModelConfig
28
+ from lcm.models.base_lcm.loader import load_base_lcm_model
29
+ from lcm.train.criterion import CriterionsFactory
30
+ from lcm.train.metrics import LCMMetricBag
31
+ from lcm.train.mse_lcm.criterion import ReconstructionCriterionConfig
32
+ from lcm.train.trainer import Trainer, TrainerBuilder, TrainingConfig
33
+ from lcm.utils.card_utils import create_model_card
34
+
35
+ logger = get_log_writer(__name__)
36
+
37
+
38
@dataclass
class LCMTrainingConfig(TrainingConfig):
    """Holds the configuration of an LCM training job."""

    training_data: List[ParquetDatasetConfig] = field(default_factory=list)
    """The datasets to train with."""  # TODO use dataset cards

    validation_data: List[ParquetDatasetConfig] = field(default_factory=list)
    """The datasets to validate on."""  # TODO use dataset cards

    model_config_or_name: Union[AbstractLCModelConfig, str, None] = None
    """The model configuration or name to train."""

    requirements: Requirements = field(
        default_factory=lambda: Requirements(
            nodes=1,
            tasks_per_node=8,
            gpus_per_node=8,
            cpus_per_task=8,
            mem_gb=256,
            timeout_min=3 * 24 * 60,  # three days, in minutes
            constraint="volta32gb",
        )
    )
    """The scheduling requirements for this trainer"""

    criterion: ReconstructionCriterionConfig = MISSING
    """The MSE loss is the default base criterion used in either the `lcm` or `mse_lcm` trainers"""

    max_subword_length: int = 512
    """ Max subword length used to truncate seqs during sonar decoder backprop"""
69
+
70
+
71
class LCMTrainer(Trainer):
    """Trainer for LCM models.

    Thin specialization of the generic :class:`Trainer`: it builds the
    criterion from the config via the :class:`CriterionsFactory`, sets up
    per-dataset metric bags, and can emit a model card for the last
    checkpoint.
    """

    config: LCMTrainingConfig
    model: Module
    training_data_loader: LCMDataLoader
    validation_data_loader: Optional[LCMDataLoader]
    gang: Gang
    optimizer: Optimizer
    loss_scaler: DynamicLossScaler
    lr_scheduler: AbstractLRScheduler
    rng_bag: RngBag
    step_nr: int
    train_metric_bag: LCMMetricBag
    valid_metric_bag: Mapping[str, LCMMetricBag]
    metric_recorders: List[MetricRecorder]
    profiler: Profiler
    stopwatch: Stopwatch

    def __init__(
        self,
        config: LCMTrainingConfig,
        model: Module,
        training_data_loader: LCMDataLoader,
        validation_data_loader: Optional[LCMDataLoader],
        gang: Gang,
        checkpoint_manager: FileCheckpointManager,
        rng_bag: RngBag,
        stopwatch: Stopwatch,
        card_metadata: Dict,
    ) -> None:
        super().__init__(
            config,
            model,
            training_data_loader,
            validation_data_loader,
            gang,
            checkpoint_manager,
            rng_bag,
            stopwatch,
            card_metadata=card_metadata,
        )

    def setup_criterion(self):
        """Build the criterion named in the config from the factory registry."""
        return CriterionsFactory.build_criterion(
            name=self.config.criterion.name,
            config=self.config.criterion,
            model=self.model,
        )

    def setup_metric_bags(self):
        """Create the training metric bag and one validation bag per dataset."""
        self.train_metric_bag = LCMMetricBag(
            self.gang,
            loss_summands=self.criterion.summands,
            reduction=self.criterion.reduction,
        )

        # Validation bags are non-stateful: they are not saved in checkpoints.
        self.register_non_stateful(
            "valid_metric_bag",
            {
                ds_name(dataset): LCMMetricBag(
                    self.gang,
                    loss_summands=self.criterion.summands,
                    reduction=self.criterion.reduction,
                )
                for dataset in self.config.validation_data
            },
        )

    def create_model_card_for_last_checkpoint(
        self, is_final: bool = True, **card_kwargs
    ) -> Optional[AssetCard]:
        """Create a model card based on the last saved
        checkpoint and the model config.

        :param is_final: if True, use the last completed checkpoint step;
            otherwise use the in-progress checkpoint step.
        :returns: the created card, or None when no checkpoint exists.
        """

        current_step_number: Optional[int] = None
        if is_final:
            steps = self.checkpoint_manager.get_step_numbers()
            current_step_number = steps[-1] if len(steps) else None
        else:
            current_step_number = self.checkpoint_manager._get_checkpoint_step_nr()

        if current_step_number is None:
            # Fixed typo in log message: "wil" -> "will".
            logger.warning(
                "No checkpoint was saved, the final model card will not be created"
            )
            return None

        # NOTE(review): this hard-codes a `model.pt` filename; if checkpoints
        # are saved as safetensors this path may be stale — confirm against
        # the checkpoint manager's actual output.
        cp_fn = (
            self.checkpoint_manager._checkpoint_dir
            / f"step_{current_step_number}"
            / "model.pt"  # type: ignore
        )

        card = create_model_card(
            checkpoint_path=cp_fn.absolute(),
            model_arch=self.card_metadata["model_arch"],
            model_config=self.card_metadata["model_config"],
            model_type=self.card_metadata["model_type"],
            **card_kwargs,
        )
        return card
171
+
172
+
173
class LCMTrainerBuilder(TrainerBuilder):
    """Builds an :class:`LCMTrainer`: loads data, creates and wraps the
    model (FSDP/DDP), and restores from a checkpoint when one exists."""

    config: LCMTrainingConfig

    def __init__(self, config: LCMTrainingConfig):
        super().__init__(config)

    def load_data(self):
        """Load training and validation data"""

        training_data_loader = LCMDataLoader(
            data_config=self.config.data_loading_config,
            datasets=self.config.training_data,
            max_subword_length=self.config.max_subword_length,
            dtype=self.dtype,
            gang=self.gang,
        )

        validation_data_loader = LCMDataLoader(
            data_config=self.config.validation_data_loading_config,
            datasets=self.config.validation_data,
            max_subword_length=self.config.max_subword_length,
            dtype=self.dtype,
            gang=self.gang,
        )

        return training_data_loader, validation_data_loader

    @property
    def model_loader(self):
        """A fairseq2 ModelLoader"""
        return load_base_lcm_model

    def build_trainer(self):
        """Build the trainer by loading data and
        setting up the model for training"""

        training_data_loader, validation_data_loader = self.load_data()

        checkpoint_manager = FileCheckpointManager(
            self.config.output_dir.joinpath("checkpoints"),
            self.gang,
        )

        # Must be checked before model creation: it decides whether the
        # trainer restores state at the end of this method.
        self.has_checkpoint = checkpoint_manager.has_checkpoint()

        model = self.create_model()

        # Force all model parameters to bfloat16 regardless of submodule defaults
        model = model.to(dtype=torch.bfloat16)

        model = self.maybe_load_model(model)

        model = self.maybe_freeze_parameters(model)

        # If using the META device, we need to move the model to gang.device
        wrapped_model = None

        if self.use_fsdp:
            wrapped_model = self.wrap_model_with_fsdp(model)
        elif self.use_ddp:
            wrapped_model = self.wrap_model_with_ddp(model)  # type: ignore

        trainer = LCMTrainer(
            self.config,  # type: ignore
            wrapped_model or model,
            training_data_loader,
            validation_data_loader,
            self.gang,
            checkpoint_manager,
            self.rng_bag,
            self.stopwatch,
            card_metadata=self.card_metadata,
        )

        trainer.setup()

        if self.has_checkpoint:
            trainer.restore()

        return trainer
253
+
254
+
255
def prepare_lcm_trainer(config: LCMTrainingConfig) -> LCMTrainer:
    """Create an LCM Trainer.

    :param config: The training configuration.
    """
    builder = LCMTrainerBuilder(config)
    return builder.build_trainer()
lcm/train/metrics.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from collections.abc import MutableMapping
7
+ from dataclasses import dataclass, field
8
+ from functools import partial
9
+ from pathlib import Path
10
+ from typing import (
11
+ Any,
12
+ Callable,
13
+ Dict,
14
+ List,
15
+ Mapping,
16
+ Optional,
17
+ Sequence,
18
+ Set,
19
+ Tuple,
20
+ Union,
21
+ )
22
+
23
+ import torch
24
+ from fairseq2.gang import Gang
25
+ from fairseq2.logging import get_log_writer
26
+ from fairseq2.metrics import (
27
+ MetricBag,
28
+ format_as_float,
29
+ format_as_int,
30
+ format_as_seconds,
31
+ )
32
+ from fairseq2.metrics.recorder import (
33
+ MetricRecorder,
34
+ _metric_formatters,
35
+ register_metric_formatter,
36
+ )
37
+ from fairseq2.typing import override
38
+ from torch import Tensor
39
+ from torch.cuda import _get_device_index
40
+ from torcheval.metrics import Max, Mean, Sum, Throughput
41
+
42
+ logger = get_log_writer(__name__)
43
+
44
+ format_as_percent = partial(format_as_int, postfix="%")
45
+
46
+
47
+ def flatten_dict(d: MutableMapping, parent_key: str = "", sep: str = ".") -> Dict:
48
+ """
49
+ A helper function to flatten nested dictionaries
50
+ Example. With a training config like
51
+ config = {
52
+ 'data': {
53
+ 'training': {'batch_size': 10},
54
+ 'validation': {'batch_size': 2}
55
+ },
56
+ 'model': {'model_dim': 1024},
57
+ 'use_fsdp': True
58
+ }
59
+ The flat config will be:
60
+ {
61
+ 'data.training.batch_size': 10,
62
+ 'data.validation.batch_size': 2,
63
+ 'model.model_dim': 1024,
64
+ 'use_fsdp': True
65
+ }
66
+ This helper is used to convert our nested training config into a flat
67
+ dictionary for Tensoarboard's HParams conusmption
68
+
69
+ """
70
+ items: List = []
71
+ for k, v in d.items():
72
+ new_key = parent_key + sep + k if parent_key else k
73
+ if isinstance(v, MutableMapping):
74
+ items.extend(flatten_dict(v, new_key, sep=sep).items())
75
+ else:
76
+ items.append((new_key, v))
77
+ return dict(items)
78
+
79
+
80
+ def get_allocated_gpu_memory(device):
81
+ """
82
+ Get allocated memory in GiB for GPU devices
83
+ """
84
+ if device.type == "cpu":
85
+ return 0, 0
86
+ device = _get_device_index(device, optional=True)
87
+ memory_stats = torch.cuda.memory_stats(device=device)
88
+ current_usage = memory_stats["allocated_bytes.all.current"] / (1024**3)
89
+ peak_usage = memory_stats["allocated_bytes.all.peak"] / (1024**3)
90
+ return current_usage, peak_usage
91
+
92
+
93
@dataclass
class LossTerm:
    """Dataclass for a batch loss term"""

    value: Tensor
    """The final loss to be optimized"""

    batch_size: int

    num_target_elements: Union[int, float]

    summands: Dict[str, Tuple[Any, Any]] = field(default_factory=lambda: {})
    """A dictionary of loss terms to record. Each term is a tuple of (loss, number of elements)
    The second term is optional; if None, we will use `num_target_elements` when aggregating"""
107
+
108
+
109
class LCMMetricBag(MetricBag):
    """Holds the common metrics of an LCM."""

    loss: Mean
    batch_size: Sum
    elements_per_batch: Mean
    elements_per_second: Throughput
    num_target_elements: Sum
    total_num_target_elements: Sum

    # NOTE(review): declared and reset in `reset_batch_metrics` but never
    # registered in `__init__` — presumably registered elsewhere; confirm.
    grad_norm: Mean

    def __init__(
        self, gang: Gang, loss_summands: Sequence[str] = [], reduction: str = "sum"
    ) -> None:
        """
        :param gang:
            The gang to sync metrics across all processes.
        :param loss_summands:
            Names of additional loss terms to track (one Mean metric each).
        :param reduction:
            "sum" or "mean"; controls the reporting-time normalization in
            :meth:`update`.
        """
        super().__init__(gang)

        # temporary fix:

        self.reduction = reduction

        d = gang.device

        # A temporary solution to track as many loss terms as we explore
        self.loss_summands = loss_summands

        self.register_metric("loss", Mean(device=d), persistent=False)

        # this is the effective batch size
        self.register_metric("batch_size", Sum(device=d), persistent=False)

        self.register_metric("elements_per_batch", Mean(device=d), persistent=False)

        self.register_metric(
            "elements_per_second", Throughput(device=d), persistent=False
        )

        self.register_metric("gpu_memory_usage", Max(device=d), persistent=False)

        self.register_metric("gpu_peak_memory_usage", Max(device=d), persistent=False)

        # self.register_metric("ram_percentage", Max(device=d), persistent=False)

        # self.register_metric("cpu_percentage", Max(device=d), persistent=False)

        for summand in self.loss_summands:
            self.register_metric(summand, Mean(device=d), persistent=False)

        # The number of target tokens in a parallel batch. Used for computing throughput
        self.register_metric("num_target_elements", Sum(device=d), persistent=False)

        # The total_num_target_elements is persistent and is supposed to track the
        # total number of tokens consumed since training started
        self.total_num_target_elements = Sum(device=d)

    def register_adaln_metric(self, module_name: str):
        # Registers mean/std tracking metrics (plus display formatters) for
        # the AdaLN shift/scale/gate tensors of a module's mha and ffn blocks.
        for block in ["mha", "ffn"]:
            for tensor in [
                "shift",
                "scale",
                "gate",
            ]:
                self.register_metric(
                    f"{module_name}_{block}_{tensor}_mean",
                    Mean(device=self._gang.device),
                    persistent=False,
                )
                self.register_metric(
                    f"{module_name}_{block}_{tensor}_std",
                    Mean(device=self._gang.device),
                    persistent=False,
                )
                # formatters
                register_metric_formatter(
                    f"{module_name}_{block}_{tensor}_mean",
                    f"{module_name}_{block}_{tensor}_mean",
                    1000,
                    format_as_float,
                )
                register_metric_formatter(
                    f"{module_name}_{block}_{tensor}_std",
                    f"{module_name}_{block}_{tensor}_std",
                    1000,
                    format_as_float,
                )

    def register_module_metric(self, module_name: str):
        # Registers mean/std tracking metrics (plus display formatters) for a
        # module's input/output activations and gradients.
        for tensor in [
            "input_gradient",
            "output_gradient",
            "input_activations",
            "output_activations",
        ]:
            self.register_metric(
                f"{module_name}_{tensor}_mean",
                Mean(device=self._gang.device),
                persistent=False,
            )
            self.register_metric(
                f"{module_name}_{tensor}_std",
                Mean(device=self._gang.device),
                persistent=False,
            )
            # formatters
            register_metric_formatter(
                f"{module_name}_{tensor}_mean",
                f"{module_name}_{tensor}_mean",
                1000,
                format_as_float,
            )
            register_metric_formatter(
                f"{module_name}_{tensor}_std",
                f"{module_name}_{tensor}_std",
                1000,
                format_as_float,
            )

    @torch.inference_mode()
    def update(
        self,
        losses: Sequence[LossTerm],
    ) -> None:
        """Update the metrics.

        :param losses:
            The losses generated by the model for each batch; more than one
            element only when using gradient accumulation.
        """

        loss = torch.zeros((), dtype=torch.float64)

        loss_summands = {
            s: torch.zeros((), dtype=torch.float64) for s in self.loss_summands
        }
        # Denominator to normalize the loss summands, if -1,
        # we will default to normalizing with `num_target_elements`
        loss_summands_numel = {
            s: -torch.ones((), dtype=torch.long) for s in self.loss_summands
        }

        batch_size = torch.zeros((), dtype=torch.int64)

        num_target_elements = torch.zeros((), dtype=torch.int64)

        # Only in the case of using gradient accumulation that `losses` will be a non-singleton
        for batch_loss in losses:
            loss += float(batch_loss.value)

            for s in self.loss_summands:
                loss_term = batch_loss.summands.get(s, (0.0, None))
                loss_summands[s] += float(loss_term[0])
                if loss_term[1] is not None and not loss_term[1] == -1:
                    # First explicit denominator seen: switch from the -1
                    # sentinel to an accumulating counter.
                    if loss_summands_numel[s] == -1:
                        loss_summands_numel[s] = torch.zeros((), dtype=torch.int64)
                    loss_summands_numel[s] += loss_term[1]

            batch_size += batch_loss.batch_size
            num_target_elements += batch_loss.num_target_elements

        # Misleading normalization in the metric bag with reduction == "mean"
        # Kept here for backward compatibility
        # Any normalization here is only for reporting and doesn't impact optimization
        if self.reduction == "sum":
            loss /= num_target_elements
            keys = list(loss_summands)
            for k in keys:
                denom = loss_summands_numel[k]
                if denom == -1:
                    denom = num_target_elements
                loss_summands[k] /= denom + 1e-6

        self.loss.update(loss, weight=num_target_elements)

        for s in loss_summands:
            weight = loss_summands_numel[s]
            if weight == -1:
                weight = num_target_elements
            getattr(self, s).update(loss_summands[s], weight=weight)

        self.batch_size.update(batch_size)

        self.elements_per_batch.update(num_target_elements)

        self.num_target_elements.update(num_target_elements)

        # update the cumulative metric
        self.total_num_target_elements.update(num_target_elements)

        # Get GPU memory usage
        gpu_memory_usage, gpu_peak_memory_usage = get_allocated_gpu_memory(
            self._gang.device
        )
        self.gpu_memory_usage.update(torch.tensor(gpu_memory_usage))
        self.gpu_peak_memory_usage.update(torch.tensor(gpu_peak_memory_usage))

    def reset_batch_metrics(self) -> None:
        """Reset the batch metrics to their initial state."""
        self.loss.reset()
        for s in self.loss_summands:
            getattr(self, s).reset()

        self.batch_size.reset()
        self.elements_per_batch.reset()
        self.elements_per_second.reset()
        self.grad_norm.reset()
        self.gpu_memory_usage.reset()
        self.gpu_peak_memory_usage.reset()
        # self.ram_percentage.reset()
        # self.cpu_percentage.reset()
323
+
324
+
325
## Weight and Biases recorder

# wandb is an optional dependency: only needed when LCMWandBRecorder is used.
try:
    import wandb  # type: ignore[import-not-found]
except ImportError:
    has_wandb = False
else:
    has_wandb = True
333
+
334
+
335
class LCMWandBRecorder(MetricRecorder):
    """Records metric values to Weights & Biases."""

    # Class-level (shared across instances) set of run names whose custom
    # step axis has already been defined.
    defined_runs: Set[str] = set()

    def __init__(
        self,
        project: Optional[str] = None,
        name: Optional[str] = None,
        output_dir: Optional[Path] = None,
        config: Dict[str, Any] = {},
        **kwargs,
    ) -> None:
        """
        :param project: A project to organise this run with other experiments, if none, the run will go under `uncategorized`.
        :param name: A unique name for your run, if none is given, a random name will be generated
        :param output_dir: The base directory under which to store the W&B files. You don't have to provide this.
        :param config: A dictionary of key-value pairs to be stored as the experiment's config. (akin to hparams in tb)
        :param kwargs: Additional arguments to pass to wandb.init()

        In order to use W&B, run `wandb login` from the command line and enter
        the API key when prompted.
        """
        if not has_wandb:
            log = get_log_writer(__name__)
            log.warning("wandb not found. Please install it with `pip install wandb`.")  # fmt: skip

            self._run = None
        else:
            if output_dir:
                output_dir.mkdir(parents=True, exist_ok=True)
            self._run = wandb.init(  # type: ignore
                project=project,
                name=name,
                dir=output_dir,
                resume="allow",
                config=config,
                **kwargs,
            )

    def _define_run(self, run: str):
        """Define the custom step axis for ``run``, once per run name."""
        if run in self.defined_runs:
            return
        # https://docs.wandb.ai/guides/track/log/customize-logging-axes/
        wandb.define_metric(f"{run}/step")
        wandb.define_metric(f"{run}/*", step_metric=f"{run}/step")
        # BUGFIX: the run was never recorded, so the early-return above never
        # fired and define_metric was re-issued on every record_metrics call.
        self.defined_runs.add(run)

    @override
    def record_metrics(
        self,
        run: str,
        values: Mapping[str, Any],
        step_nr: Optional[int] = None,
        *,
        flush: bool = True,
    ) -> None:
        """Log ``values`` for ``run`` against its custom step axis.

        No-op when wandb is unavailable (``self._run is None``).
        """
        if self._run is None:
            return

        self._define_run(run)

        for name, value in values.items():
            # Use the registered display name when a formatter exists.
            formatter = _metric_formatters.get(name)
            if formatter is None:
                display_name = name
            else:
                display_name = formatter.display_name

            self._run.log({f"{run}/{display_name}": value, f"{run}/step": step_nr})

    @override
    def close(self) -> None:
        """Finish the underlying wandb run, if one was started."""
        if self._run is not None:
            self._run.finish()
409
+
410
+
411
# Display name, sort priority, and value formatter for every LCM metric;
# registered with fairseq2's metric recorder below (overwriting defaults).
lcm_metric_formatters: Dict[str, Tuple[str, int, Callable[[Any], str]]] = {
    # fmt: off
    "loss": ("Loss", 100, format_as_float),
    "nll_loss": ("NLL Loss", 100, format_as_float),
    "mse_loss": ("MSE Loss", 100, format_as_float),
    "contrastive_loss": ("Contrastive Loss", 110, format_as_float),
    "reconstruction_loss": ("Reconstruction loss", 110, format_as_float),
    "unnormalized_reconstruction_loss": (
        "Unnormalized Reconstruction Loss",
        110,
        format_as_float,
    ),
    "kld": ("KLD loss", 110, format_as_float),
    "encoder_mse_loss": ("Encoder MSE loss", 110, format_as_float),
    "decoder_ce_loss": ("Decoder CE loss", 110, format_as_float),
    "elapsed_time": ("Elapsed Time", 500, format_as_seconds),
    "wall_time": ("Wall Time", 510, format_as_seconds),
    "lr": ("Learning Rate", 800, format_as_float),
    "loss_scale": ("Loss Scale", 810, format_as_float),
    "grad_norm": ("Grad norm", 810, format_as_float),
    "raw_grad_norm": ("Raw Grad norm", 815, format_as_float),
    "encoder_mse_scale": ("Encoder MSE loss scale", 850, format_as_float),
    "batch_size": ("Batch Size", 900, format_as_int),
    "elements_per_batch": ("Elements per Batch", 900, format_as_int),
    "elements_per_second": ("Elements per Second", 900, format_as_int),
    "num_examples": ("Number of Examples", 900, format_as_int),
    "num_source_elements": ("Number of Source Elements", 900, format_as_int),
    "num_target_elements": ("Number of Target Elements", 900, format_as_int),
    "total_num_target_elements": ("Accumulated Target Elements", 920, format_as_int),
    "gpu_memory_usage": ("GPU memory usage (GiB)", 910, format_as_float),
    "gpu_peak_memory_usage": ("GPU peak memory usage (GiB)", 910, format_as_float),
    "ram_percentage": ("RAM usage", 920, format_as_percent),
    "cpu_percentage": ("CPU usage", 920, format_as_percent),
    "mean_predicted_embeddings": ("mean_predicted_embeddings", 920, format_as_float),
    "std_predicted_embeddings": ("std_predicted_embeddings", 920, format_as_float),
    # fmt: on
}
for key in lcm_metric_formatters:
    register_metric_formatter(key, *lcm_metric_formatters[key], overwrite=True)
lcm/train/mse_lcm/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
lcm/train/mse_lcm/criterion.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Tuple
8
+
9
+ import torch
10
+ from fairseq2.logging import get_log_writer
11
+ from torch import Tensor
12
+
13
+ from lcm.datasets.batch import EmbeddingsBatch, LCMInput, LCMStyle
14
+ from lcm.models.abstract_lcm import AbstractLCModel
15
+ from lcm.train.criterion import CriterionsFactory
16
+ from lcm.train.lcm.criterion import (
17
+ LCMCriterion,
18
+ LCMCriterionConfig,
19
+ compute_standard_mse,
20
+ )
21
+ from lcm.train.metrics import LossTerm
22
+
23
+ logger = get_log_writer(__name__)
24
+
25
+
26
@dataclass
class ReconstructionCriterionConfig(LCMCriterionConfig):
    """Configuration for MSE-based next-sentence reconstruction criterions."""

    min_context_size: int = 1
    """minimum context size for next sentence prediction"""
30
+
31
+
32
@CriterionsFactory.register("next_sentence_mse")
class ReconstructionCriterion(LCMCriterion):
    """Computes the MSE reconstruction loss for next-sentence prediction.

    Given a document as a sequence of sentence embeddings, the model is asked
    to predict embedding ``s_t`` from the prefix ``s_{<t}``; the loss is the
    (optionally root-) mean squared error over the masked target positions.
    """

    config: ReconstructionCriterionConfig

    def __init__(
        self,
        config: ReconstructionCriterionConfig,
        model: AbstractLCModel,
        style: LCMStyle = LCMStyle.UNSUPERVISED,
    ):
        super().__init__(config, model, style)

        if style is not LCMStyle.SUPERVISED:
            assert (
                config.min_context_size is not None and config.min_context_size > 0
            ), (
                "For unsupervised pre-training, expecting a min_context_size of at least 1. "
                f"Received min_context_size={config.min_context_size}. "
                "Note that we need some context to predict the first position and "
                "this context can come from a dummy `beginning of document (BOD)` vector. "
                "With a minimum context size of 1 we ensure that we never ask the model to predict BOD"
            )

        self.min_context_size = config.min_context_size

    def prepare_input_and_mask(
        self,
        batch: LCMInput,
    ) -> Tuple[EmbeddingsBatch, torch.Tensor]:
        """
        A method for preparing model inputs and mask for a batch.
        It will be typically reused by the `__call__`
        implementations of the subclasses.
        """
        input_embeddings = batch.prepare_input(style=self.style)

        target_mask = batch.prepare_target_mask(
            input_embeddings,
            style=self.style,
            min_context_size=self.config.min_context_size,
        )

        return input_embeddings, target_mask

    def __call__(self, batch: LCMInput) -> LossTerm:
        """
        Args:
            batch is an LCMInput (see lcm.datasets.batch):

        Returns a LossTerm
        """

        # prepare_input_and mask returns embeddings with seqs in B,T,C
        # and a target mask in B,T,C. Note that the first position is never used as target
        # (i.e. BOS vector or first sentence in the document) and will always be set to False
        # in the target mask
        input_embeddings, target_mask = self.prepare_input_and_mask(batch)

        if self.normalize_in_criterion:
            # the input to the model will be normalize and
            # so is the target used for loss computation
            input_embeddings = input_embeddings.normalize_seqs(self.sonar_normalizer)

        # Predict model outputs
        output_embeddings = self.model(input_embeddings)

        # Prepare predictions and targets:
        # Shift the input to remove the first position.
        # Shifted seqs from input_embeddings are used as ground truth target embeddings
        target_seqs = input_embeddings.seqs[:, 1:].contiguous()
        batch_size, _, sonar_dim = target_seqs.size()

        # shift and flatten
        target_mask = target_mask[:, 1:].reshape(-1)
        # i.e. s2, s3, s4, s5

        # Trim the last position.
        # output_seqs represent contextualized embeddings / predictions for the next sentence
        # This shifting/trimming allows us to predict `s_t` conditioned on `s_{<t}`
        predicted_seqs = output_embeddings.seqs[:, :-1].contiguous()
        # i.e. s<=1, s<=2, s<=3, s<=4

        # only measure distance over `target_mask = True` positions
        flattened_predictions = predicted_seqs.view(-1, sonar_dim)[target_mask]
        flattened_target = target_seqs.view(-1, sonar_dim)[target_mask]

        # Cast features to float32 before computing the loss:
        reconstruction_loss, mse_loss = self.compute_loss(
            flattened_predictions.float(), flattened_target.float()
        )

        num_target_elements = target_mask.sum()

        # NOTE(review): if self.reduction is neither "sum" nor "mean" (and there
        # are target elements), `reduced_reconstruction_loss` is never bound and
        # the line below raises NameError — presumably `reduction` is validated
        # upstream in LCMCriterion; confirm.
        if self.reduction == "sum" or num_target_elements == 0:
            reduced_reconstruction_loss = reconstruction_loss.sum()
            mse_loss = mse_loss.sum()

        elif self.reduction == "mean":
            reduced_reconstruction_loss = reconstruction_loss.mean()
            mse_loss = mse_loss.mean()

        final_loss = reduced_reconstruction_loss

        # Loss summands for records
        summands = {
            "mse_loss": (mse_loss.item(), None),
            "reconstruction_loss": (reduced_reconstruction_loss.item(), None),
        }

        return LossTerm(
            value=final_loss,
            batch_size=batch_size,
            num_target_elements=num_target_elements.item(),
            summands=summands,
        )

    def compute_loss(
        self, flattened_predictions, flattened_target
    ) -> Tuple[Tensor, Tensor]:
        """
        Computes the following loss terms:
        1. The Reconstruction loss we want to optimize as well as:
        2. RMSE loss (for tracking) (in this parent class, RMSE=Reconstruction loss)
        Returns reconstruction_loss, mse_loss
        """
        reconstruction_loss, _ = compute_standard_mse(
            flattened_predictions, flattened_target
        )
        if self.config.compute_rmse:
            # Small epsilon keeps the sqrt differentiable at zero loss.
            epsilon = 1e-5
            reconstruction_loss = torch.sqrt(reconstruction_loss + epsilon)

        return reconstruction_loss, reconstruction_loss
167
+
168
+
169
@CriterionsFactory.register("target_mse")
class TargetMSECriterion(ReconstructionCriterion):
    """Computes the LCM training objective given source/target pairs.

    Behaviorally identical to ReconstructionCriterion; the only difference is
    that the default style is SUPERVISED (source/target pairs) rather than
    UNSUPERVISED (next-sentence pre-training).
    """

    def __init__(
        self,
        config: ReconstructionCriterionConfig,
        model: AbstractLCModel,
        style: LCMStyle = LCMStyle.SUPERVISED,
    ):
        super().__init__(config, model, style)
lcm/train/optim.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from typing import Tuple
7
+
8
+ from fairseq2.logging import get_log_writer
9
+ from fairseq2.optim.lr_scheduler import (
10
+ AbstractLRScheduler,
11
+ CosineAnnealingLR,
12
+ MyleLR,
13
+ NoopLR,
14
+ PolynomialDecayLR,
15
+ TriStageLR,
16
+ )
17
+ from torch.optim import Optimizer
18
+
19
+ logger = get_log_writer(__name__)
20
+
21
+
22
def build_lr_scheduler(
    optimizer: Optimizer,
    lr: float,
    warmup_steps: int,
    start_lr: float = 1e-7,
    final_lr: float = 1e-5,
    max_steps: int = 10_000,
    stage_ratio: Tuple[float, ...] = (0.1, 0.4, 0.5),
    schedule: str = "myle",
) -> AbstractLRScheduler:
    """Build a fairseq2 learning-rate scheduler for ``optimizer``.

    Args:
        optimizer: the optimizer whose learning rate will be scheduled.
        lr: the main (post-warm-up) learning rate.
        warmup_steps: number of warm-up steps.
        start_lr: the initial warm-up learning rate.
        final_lr: the final (decayed) learning rate.
        max_steps: total number of training steps (used by cosine/wsd/polynomial).
        stage_ratio: warmup/stable/decay ratios for the ``wsd`` (tri-stage) schedule.
        schedule: one of ``noop``, ``myle``, ``cosine``, ``wsd``, ``polynomial``.

    Returns:
        The configured ``AbstractLRScheduler``.

    Raises:
        AssertionError: if ``schedule`` is unknown, ``lr`` is not strictly
            positive, or (for ``wsd``) the lr bounds are inconsistent.
    """
    # Fix: the original message omitted "polynomial" even though it is supported.
    supported_schedules = ("noop", "myle", "cosine", "wsd", "polynomial")
    assert schedule in supported_schedules, (
        f"Cannot recognize the learning rate schedule {schedule}, "
        f"only {', '.join(supported_schedules)} are supported"
    )

    assert lr > 0, "The learning rate should be strictly positive"

    lr_scheduler: AbstractLRScheduler

    if schedule == "noop":
        # Constant learning rate, no warm-up.
        lr_scheduler = NoopLR(optimizer)

    elif schedule == "myle":
        # Inverse-sqrt schedule as implemented in Fairseq.
        lr_scheduler = MyleLR(
            optimizer,
            num_warmup_steps=warmup_steps,
            start_lr=[start_lr],
        )

    elif schedule == "cosine":
        # Single cosine cycle spanning all post-warm-up steps.
        lr_scheduler = CosineAnnealingLR(
            optimizer,
            cycle_len=max_steps - warmup_steps + 1,
            num_warmup_steps=warmup_steps,
            start_lr=[start_lr],
            final_lr=[final_lr],
            cycle_mul=1.0,
            lr_mul=1.0,
        )

    elif schedule == "wsd":
        # Warmup-Stable-Decay (tri-stage): TriStageLR works with scales
        # relative to the main lr, hence the checks and divisions below.
        assert lr > start_lr, (
            f"the starting learning rate {start_lr} should be less than the main lr {lr}"
        )
        start_lr_scale = start_lr / lr

        assert lr > final_lr, (
            f"the final learning rate {final_lr} should be less than the main lr {lr}"
        )
        final_lr_scale = final_lr / lr

        lr_scheduler = TriStageLR(
            optimizer,
            max_steps,
            stage_ratio=stage_ratio,  # type: ignore
            start_lr_scale=start_lr_scale,
            final_lr_scale=final_lr_scale,
        )

    elif schedule == "polynomial":
        lr_scheduler = PolynomialDecayLR(
            optimizer,
            max_steps,
            warmup_steps,
            power=200,
            start_lr=start_lr,
            final_lr=final_lr,
        )

    return lr_scheduler
lcm/train/step_sampler.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Literal, Optional
8
+
9
+ import torch
10
+ import torch.distributions as D
11
+ from fairseq2.logging import get_log_writer
12
+ from torch import Tensor
13
+
14
+ from lcm.nn.schedulers import DDIMScheduler
15
+
16
+ SUPPORTED_SAMPLERS = Literal["uniform", "beta"]
17
+ SUPPORTED_WEIGHTINGS = Literal["none", "clamp_snr"]
18
+
19
+ logger = get_log_writer(__name__)
20
+
21
+
22
def beta_function(a, b):
    """Euler Beta function B(a, b) = Γ(a)·Γ(b) / Γ(a + b).

    Evaluated in log-space via ``lgamma`` for numerical stability.
    """
    log_beta = torch.lgamma(a) + torch.lgamma(b) - torch.lgamma(a + b)
    return log_beta.exp()
25
+
26
+
27
@dataclass
class StepsSamplerConfig:
    """Configuration for sampling diffusion timesteps and weighting their losses."""

    # How training timesteps are drawn: "uniform" or "beta".
    sampling: SUPPORTED_SAMPLERS = "uniform"
    # Per-step loss weighting: "none" or Min-SNR clamping ("clamp_snr").
    weighting: SUPPORTED_WEIGHTINGS = "none"
    # Beta distribution parameters (used only when sampling="beta"); a=b=1 is uniform.
    beta_a: float = 0.8
    beta_b: float = 1
    # Clamping bounds for the Min-SNR gamma (used only when weighting="clamp_snr").
    max_gamma: float = 5.0
    min_gamma: float = 0
35
+
36
+
37
class StepsSampler(object):
    """Samples diffusion training timesteps and provides per-step loss scales.

    Builds a categorical distribution over the scheduler's training steps
    (uniform or Beta-shaped) and, optionally, Min-SNR loss weights.
    """

    def __init__(
        self,
        config: StepsSamplerConfig,
        noise_scheduler: DDIMScheduler,
    ):
        num_diffusion_train_steps = noise_scheduler.num_diffusion_train_steps
        weights: Optional[Tensor] = None

        if config.sampling == "uniform":
            weights = torch.ones(
                num_diffusion_train_steps,
            )

        elif config.sampling == "beta":
            # As motivated in https://www.ecva.net/papers/eccv_2024/papers_ECCV/papers/00328.pdf
            a = torch.tensor([config.beta_a])
            b = torch.tensor([config.beta_b])
            # a=1, b=1 -> uniform
            # The paper empirically chooses b=1, a=0.8 < 1

            # Normalized step positions in (0, 1].
            steps = (
                torch.arange(1, num_diffusion_train_steps + 1)
                / num_diffusion_train_steps
            )
            # Beta(a, b) pdf evaluated at each step position.
            weights = (
                1 / beta_function(a, b) * (steps ** (a - 1)) * ((1 - steps) ** (b - 1))
            )

        assert weights is not None, "The sampling weights were not properly set!"
        logger.info(f"Training with sampling weights={weights}")

        # Categorical over timesteps; weights are normalized to probabilities.
        self.distrib = D.Categorical(
            probs=weights / weights.sum(),
        )

        # setup weights for scaling:
        # NOTE(review): if config.weighting is neither "none" nor "clamp_snr",
        # self.gamma_per_step is never assigned and the log line below raises
        # AttributeError — presumably configs are validated upstream; confirm.
        if config.weighting == "none":
            self.gamma_per_step = None

        elif config.weighting == "clamp_snr":
            # Min-SNR scheme from
            # https://arxiv.org/abs/2303.09556
            snrs = noise_scheduler.get_snrs()
            # gamma(t) = min(max_gamma, snr(t))
            self.gamma_per_step = torch.clamp(
                snrs, max=config.max_gamma, min=config.min_gamma
            )

        logger.info(f"Training with Gamma={self.gamma_per_step}")

    @property
    def _training_weights(self) -> Tensor:
        # Probabilities of the underlying categorical distribution.
        return self.distrib.probs

    def sample(self, size: torch.Size, device: torch.device):
        """Draw a tensor of timestep indices of the given ``size`` on ``device``."""
        samples = self.distrib.sample(size).to(device)
        # print('Samples', samples)
        # print('Counts:', torch.bincount(samples.flatten()))
        return samples

    def get_loss_scales(self, steps):
        """Return per-step loss scales for ``steps``, or None when weighting is off."""
        if self.gamma_per_step is None:
            return None

        # If we're using constant Gamma=1 (returning None), then the sum of
        # the loss scales is steps.numel(), to match the total mass,
        # we normalize the scales to sum to steps.numel()
        gamma = self.gamma_per_step.to(steps.device)[steps]
        gamma = gamma / gamma.sum() * steps.numel()
        return gamma
lcm/train/trainer.py ADDED
@@ -0,0 +1,1422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ import gc
7
+ import logging
8
+ import os
9
+ import sys
10
+ from abc import abstractmethod
11
+ from contextlib import nullcontext
12
+ from dataclasses import asdict, dataclass, field
13
+ from functools import cached_property
14
+ from itertools import count
15
+ from pathlib import Path
16
+ from pprint import pformat
17
+ from typing import (
18
+ Any,
19
+ ContextManager,
20
+ Dict,
21
+ Iterator,
22
+ List,
23
+ Literal,
24
+ Mapping,
25
+ Optional,
26
+ Tuple,
27
+ )
28
+
29
+ import torch
30
+ import yaml
31
+ from fairseq2.assets import AssetCard, AssetCardFieldNotFoundError
32
+ from fairseq2.checkpoint import FileCheckpointManager
33
+ from fairseq2.gang import FakeGang, Gang, ReduceOperation, all_sum
34
+ from fairseq2.logging import get_log_writer
35
+ from fairseq2.metrics import (
36
+ LogMetricRecorder,
37
+ MetricBag,
38
+ MetricRecorder,
39
+ TensorBoardRecorder,
40
+ record_metrics,
41
+ )
42
+ from fairseq2.nn.ddp import to_ddp
43
+ from fairseq2.nn.fsdp import to_fsdp
44
+ from fairseq2.nn.utils.gradient import (
45
+ check_gradient_norms,
46
+ clip_gradient_norm,
47
+ scale_gradients,
48
+ )
49
+ from fairseq2.nn.utils.module import (
50
+ _get_named_modules,
51
+ freeze_parameters,
52
+ to_device,
53
+ )
54
+ from fairseq2.optim import AdamW, DynamicLossScaler
55
+ from fairseq2.optim.lr_scheduler import AbstractLRScheduler, get_effective_lr
56
+ from fairseq2.recipes.utils.log import log_model
57
+ from fairseq2.utils.profiler import Profiler, Stopwatch
58
+ from fairseq2.utils.rng import RngBag
59
+ from fairseq2.utils.state import StatefulObjectBag
60
+ from omegaconf import MISSING
61
+ from stopes.core import Requirements
62
+ from torch.distributed.fsdp.fully_sharded_data_parallel import (
63
+ FullyShardedDataParallel as FSDP,
64
+ )
65
+ from torch.nn import Module
66
+ from torch.nn.parallel import DistributedDataParallel as DDP
67
+ from torch.optim import Optimizer
68
+ from torch.profiler import record_function
69
+ from torcheval.metrics import Mean
70
+
71
+ from lcm.datasets.configs import DataLoadingConfig, ValidationDataLoadingConfig
72
+ from lcm.datasets.dataloading import ds_name
73
+ from lcm.train.metrics import (
74
+ LCMWandBRecorder,
75
+ flatten_dict,
76
+ )
77
+ from lcm.train.optim import build_lr_scheduler
78
+ from lcm.utils.data_utils import update_dataclass
79
+ from lcm.utils.distributed import (
80
+ SUPPORTED_FSDP_MEMORY_POLICIES,
81
+ SUPPORTED_FSDP_WRAP_POLICIES,
82
+ get_fsdp_memory_policy,
83
+ get_fsdp_wrap_policy,
84
+ init_process_group,
85
+ )
86
+ from lcm.utils.logging import (
87
+ log_env_variables,
88
+ setup_additional_logging,
89
+ )
90
+
91
+ logger = get_log_writer(__name__)
92
+
93
+
94
+ @dataclass
95
+ class TrainingConfig:
96
+ """Holds the configuration of a training job."""
97
+
98
+ training_data: Any = MISSING
99
+ """The datasets to train with."""
100
+
101
+ validation_data: Any = MISSING
102
+ """The datasets to validate on."""
103
+
104
+ model_arch: Optional[str] = None
105
+ """Starting architecture for the model to train"""
106
+
107
+ model_arch_overrides: Optional[Dict] = None
108
+ """Dict of parameters to overwrite in `model_arch`"""
109
+
110
+ model_config_or_name: Optional[Any] = None
111
+ """The model configuration or name to train.
112
+ This option cannot be paired with model_arch + model_arch_overrides
113
+ If provided, this option supersedes model_arch + model_arch_overrides
114
+ """
115
+ output_dir: Path = MISSING
116
+ """The output directory to store checkpoints and logs."""
117
+
118
+ log_folder: Optional[Path] = None
119
+ """The executor's log directory where stdout/stderr will be redirected.
120
+ We will use this directory to optionally enable ATEN and NCCL
121
+ logging (if debug is True) """
122
+
123
+ tb_dir: Optional[Path] = None
124
+ """The output directory to store tensorbaord logs"""
125
+
126
+ # defaults to "uncategorized"
127
+ wandb_project: Optional[str] = None
128
+ wandb_run_name: Optional[str] = None
129
+ wandb_entity: Optional[str] = None
130
+
131
+ requirements: Requirements = field(
132
+ default_factory=lambda: Requirements(
133
+ nodes=1,
134
+ tasks_per_node=8,
135
+ gpus_per_node=8,
136
+ cpus_per_task=8,
137
+ mem_gb=256,
138
+ timeout_min=3 * 24 * 60,
139
+ constraint="volta32gb",
140
+ )
141
+ )
142
+ """The scheduling requirements for this trainer"""
143
+
144
+ data_loading_config: DataLoadingConfig = MISSING
145
+
146
+ validation_data_loading_config: ValidationDataLoadingConfig = field(
147
+ default_factory=lambda: ValidationDataLoadingConfig()
148
+ )
149
+
150
+ criterion: Any = MISSING
151
+
152
+ dtype: str = "torch.float32"
153
+ """The data type of the model."""
154
+
155
+ lr_schedule: str = "myle"
156
+ """The learning rate schedule out of
157
+ `noop`: no learning rate schedule, just use the initial learning rate,
158
+ `myle`: inv-sqrt as implemented in Fairseq,
159
+ `cosine` cosine annealing schedule,
160
+ `wsd` for Warmup-Stable-Decay (WSD) or tri-stage """
161
+
162
+ lr: float = 0.004
163
+ """The initial (post-warm-up) learning rate for AdamW."""
164
+
165
+ start_lr: float = 1e-7
166
+ """The initial warmup learning rate."""
167
+
168
+ final_lr: float = 1e-5
169
+ """The final learning rate."""
170
+
171
+ lr_stage_ratios: List[float] = field(default_factory=lambda: [0.1, 0.4, 0.5])
172
+ """The ratios of the wsd (tri-stage) learning rate scheduler."""
173
+
174
+ num_lr_warmup_steps: int = 800
175
+ """The number of warm-up steps for the learning rate."""
176
+
177
+ weight_decay: float = 0.1
178
+ """The weight decay coefficient of AdamW (PyTorch default: 1e-2, Fs2 default: 0.0)."""
179
+
180
+ adam_betas: List[float] = field(default_factory=lambda: [0.9, 0.98])
181
+ """The beta coefficients of AdamW used for computing running averages of gradient and its square."""
182
+
183
+ adam_eps: float = 1e-6
184
+ """The term added to the denominator in AdamW to improve numerical stability.
185
+ Default in FS2 and PyTorch is 1e-8. Previous hard coded value in our trainer is 1e-6"""
186
+
187
+ use_optimizer_in_fp32: bool = True
188
+ """if True, the optimizer (AdamW) will be initialized with `use_fp32 = True`
189
+ i.e. we will store the optimizer state in single precision and convert
190
+ gradients on-the-fly to single precision for numerical stability"""
191
+
192
+ max_steps: int = 10_000
193
+ """The maximum number of training steps."""
194
+
195
+ max_grad_norm: float = 1000
196
+ """Maximal gradient norm, for gradient clipping.
197
+ gradients are multiplied by `torch.clamp(max_norm / (total_norm + 1e-6), max=1.0)`
198
+ if max_norm is arbitrarily large, then we'll only report gradients norm
199
+ """
200
+ turn_off_grad_normalization: bool = False
201
+ """If ``True``, Turn off gradient normalization"""
202
+
203
+ gradient_accumulation: int = 1
204
+ """The number of steps to accumulate gradients before an optimizer update."""
205
+
206
+ validate_every_n_steps: int = 5000
207
+ """The number of steps after which to validate the model."""
208
+
209
+ checkpoint_every_n_steps: int = 5000
210
+ """The number of steps after which to checkpoint."""
211
+
212
+ keep_last_n_checkpoints: int = -1
213
+ """The number of checkpoints to keep on disk."""
214
+
215
+ save_model_every_n_steps: int = 5000
216
+ """The number of steps after which to save a consolidated version of the model."""
217
+
218
+ preserve_consolidated_models: bool = False
219
+ """If `True`, only pt files excluding ones starting with `mdoel` will be deleted from the step checkpoint directory."""
220
+
221
+ publish_metrics_every_n_steps: int = 1
222
+ """The number of steps after which to publish training metrics."""
223
+
224
+ gc_every_n_steps: int = 1000
225
+ """The frequency of steps at which we collect garbage with `gc.collect()`."""
226
+
227
+ seed: int = 2
228
+ """The RNG seed to use while starting the job."""
229
+
230
+ debug: bool = False
231
+ """If ``True``, runs the trainer in debug mode"""
232
+
233
+ profile: bool = False
234
+ """If ``True``, runs the PyTorch profiler at the beginning of the training."""
235
+
236
+ profiler_skip_first: int = 200
237
+
238
+ profiler_active: int = 3
239
+ """If profiling (``profile = True``), The profiler will skip the first ``skip_first`` steps, then do the active recording for the next ``active`` steps
240
+ If planning to visualize the trace with tensorbaord, then ``active`` should be small (less than 10 steps), otherwise tb won't load!
241
+ """
242
+ loss_scaler_init_scale: float = 2.0**15
243
+ """The initial scale for the gradient scaler, fairseq2's default is 2.0**15"""
244
+
245
+ loss_scaler_scale_window: Optional[int] = None
246
+ """The number of consecutive optimizer steps without inf/NaN gradients that must occur for the scale to be updated"""
247
+
248
+ use_fsdp: bool = True
249
+ """If ``True``, uses FSDP instead of DDP."""
250
+
251
+ use_autocast: bool = False
252
+ """If ``True``, wrap the forward pass in AMP autocast context.
253
+ autocast is only needed if training with mixed precision.
254
+ If training fails without it, check if some module with its weights is not properly cast
255
+ """
256
+
257
+ fsdp_wrap_granularity: SUPPORTED_FSDP_WRAP_POLICIES = "model"
258
+ """The granularity at which to wrap the model."""
259
+
260
+ fsdp_memory_policy: SUPPORTED_FSDP_MEMORY_POLICIES = "standard"
261
+ """The FSDP memory policy."""
262
+
263
+ fsdp_fp32_reduce: bool = False
264
+ """ If ``True``, the gradients will be reduced in full precision even when dtype is `torch.float16`"""
265
+
266
+ use_submitit: bool = True
267
+ """If ``True``, setup the environment ti use submitit."""
268
+
269
+ fake_gang_device: Optional[str] = None
270
+ """If non-empty, the trainer will be set locally on a device, instead of distributed training."""
271
+
272
+ experiment_name: Optional[str] = None
273
+ """experiment name for job trackin, if None default to StopesModule naming"""
274
+
275
+ raise_oom: bool = False
276
+ """If ``True``, raise OOM errors when they occur, if ``False`` give it another try."""
277
+
278
+ raise_nan_or_inf: bool = False
279
+ """If ``True``, raise FloatingPointError with Nan/Inf losses, if ``False`` give it another try."""
280
+
281
+ max_ooms: int = 10
282
+ """If ```raise_oom`` is False, how many OOMs we can tolerate per rank before raising an error."""
283
+
284
+ max_nans_or_infs: int = 10
285
+ """If ```raise_nan_or_inf`` is False, how many Nan/Infs we can tolerate per rank before raising an error."""
286
+
287
+ freeze_modules: Optional[List[str]] = None
288
+ """Name of modules in the model to be frozen when training/finetuning"""
289
+
290
+ freezing_strategy: Literal["none", "modules", "ffn", "ffn-adaln", "adaln"] = "none"
291
+ """
292
+ Freezing strategy to follow. Options are:
293
+ 1. none: Nothing will be frozen (default)
294
+ 2. modules: A list of modules to freeze will be read from `freeze_modules`
295
+ 3. ffn: All ffn sub-modules will be frozen
296
+ 4. ffn-adaln: all FFN and Adaln sub-modules will be frozen.
297
+ """
298
+
299
+
300
+ class Trainer(StatefulObjectBag):
301
+ config: TrainingConfig
302
+ model: Module
303
+ training_data_loader: Any
304
+ validation_data_loader: Optional[Any]
305
+ gang: Gang
306
+ optimizer: Optimizer
307
+ loss_scaler: DynamicLossScaler
308
+ lr_scheduler: AbstractLRScheduler
309
+ rng_bag: RngBag
310
+ step_nr: int
311
+ train_metric_bag: MetricBag
312
+ valid_metric_bag: Mapping[str, MetricBag]
313
+ metric_recorders: List[MetricRecorder]
314
+ profiler: Profiler
315
+ stopwatch: Stopwatch
316
+ criterion: Any
317
+ card_metdata: Dict
318
+ _train_step_time: float
319
+ _valid_step_time: float
320
+
321
    def __init__(
        self,
        config: TrainingConfig,
        model: Module,
        training_data_loader: Any,
        validation_data_loader: Optional[Any],
        gang: Gang,
        checkpoint_manager: FileCheckpointManager,
        rng_bag: RngBag,
        stopwatch: Stopwatch,
        card_metadata: Dict,
    ) -> None:
        """Wire up the trainer state: loaders, gang, recorders, profiler.

        Criterion, optimizer and loss scaler are deferred to `setup()`.
        """
        super().__init__()

        self.config = config

        if self.config.debug:
            # Debug mode: verbose logging and synchronous CUDA kernel launches.
            logger._logger.setLevel(logging.DEBUG)
            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

        self.card_metadata = card_metadata

        # NOTE(review): eval of a config-provided string (e.g. "torch.float32").
        # Fine for trusted configs, but an explicit str -> dtype mapping would be safer.
        self.dtype = eval(config.dtype)

        self.model = model

        self.training_data_loader = training_data_loader

        # Skip saving and loading the state of validation dataloader
        self.register_non_stateful("validation_data_loader", validation_data_loader)

        self.gang = gang

        self.rng_bag = rng_bag

        self.step_nr = 1

        self.current_run_steps = 0

        self.checkpoint_manager = checkpoint_manager

        tb_dir = config.tb_dir or config.output_dir.joinpath("tb")

        self.metric_recorders = [LogMetricRecorder(logger)]

        # Only rank 0 writes TensorBoard and W&B records.
        if gang.rank == 0:
            self.metric_recorders.append(TensorBoardRecorder(tb_dir))
            self.metric_recorders.append(
                LCMWandBRecorder(
                    name=config.wandb_run_name,
                    project=config.wandb_project or "uncategorized",
                    output_dir=config.output_dir / "wandb",
                    config=self._tb_flat_config,
                )
            )

        self.profiler = Profiler(
            skip_first=config.profiler_skip_first,
            active=config.profiler_active,
            log_dir=tb_dir,
            gang=gang,
            enabled=config.profile,
        )

        self.stopwatch = stopwatch
        self._train_step_time = 0.0
        self._valid_step_time = 0.0

        # Populated later by setup() / setup_optimizer_and_lr_schedule().
        self.criterion = None  # type: ignore

        self.loss_scaler = None  # type: ignore
392
+
393
    @property
    def is_fsdp(self) -> bool:
        """True if the model is wrapped with FullyShardedDataParallel."""
        return isinstance(self.model, FSDP)
396
+
397
    @property
    def is_ddp(self) -> bool:
        """True if the model is wrapped with DistributedDataParallel."""
        return isinstance(self.model, DDP)
400
+
401
    def setup(self) -> None:
        """Finalize construction: criterion, metric bags, optimizer and LR schedule."""
        self.criterion = self.setup_criterion()

        self.setup_metric_bags()

        # Add the grad_norm metric to the training metric bag
        self.train_metric_bag.register_metric(
            "grad_norm", Mean(device=self.gang.device), persistent=False
        )
        self.train_metric_bag.register_metric(
            "raw_grad_norm", Mean(device=self.gang.device), persistent=False
        )

        self.setup_optimizer_and_lr_schedule()
415
+
416
    def setup_optimizer_and_lr_schedule(self):
        """Create the AdamW optimizer, loss scaler and LR scheduler from config.

        Optimizer and LR scheduler are registered as stateful so they are
        checkpointed and restored on resume.
        """
        optimizer = AdamW(
            self.model.parameters(),
            lr=self.config.lr,
            betas=tuple(self.config.adam_betas),  # type: ignore
            eps=self.config.adam_eps,
            use_fp32=self.config.use_optimizer_in_fp32,
            weight_decay=self.config.weight_decay,
        )
        logger.info(
            (
                f"Setting up AdamW optimizer with betas={self.config.adam_betas}, "
                f"base lr={self.config.lr} and weight decay={self.config.weight_decay} "
                f"and use_fp32={self.config.use_optimizer_in_fp32}"
            )
        )

        self.register_stateful("optimizer", optimizer)

        # Dynamic loss scaling is only active when training in fp16.
        self.loss_scaler = DynamicLossScaler(
            optimizer,
            gang=self.gang,
            init_scale=self.config.loss_scaler_init_scale,
            min_scale=0.0001,
            scale_window=self.config.loss_scaler_scale_window,
            enabled=self.dtype == torch.float16,
        )

        if self.loss_scaler.is_enabled:
            logger.info(
                f"Initializing DynamicLossScaler with init_scale={self.config.loss_scaler_init_scale}"
            )

        lr_scheduler = build_lr_scheduler(
            optimizer=self.optimizer,
            schedule=self.config.lr_schedule,
            lr=self.config.lr,
            warmup_steps=self.config.num_lr_warmup_steps,
            start_lr=self.config.start_lr,
            final_lr=self.config.final_lr,
            max_steps=self.config.max_steps,
            stage_ratio=tuple(self.config.lr_stage_ratios),
        )

        # Saving the lr_scheduler as well to properly resume training
        self.register_stateful("lr_scheduler", lr_scheduler)
462
+
463
    @abstractmethod
    def setup_criterion(self):
        """Define a criterion (loss / objective function to optimize).

        Subclasses must return the criterion instance assigned to ``self.criterion``.
        """
466
+
467
    def setup_metric_bags(self):
        """Setup metric bags for tracking.

        One bag for training, plus one per validation dataset (keyed by
        dataset name). The validation bags are non-stateful, i.e. excluded
        from checkpoints.
        """

        self.train_metric_bag = MetricBag(self.gang)

        self.register_non_stateful(
            "valid_metric_bag",
            {
                ds_name(dataset): MetricBag(self.gang)
                for dataset in self.config.validation_data
            },
        )
479
+
480
    def checkpoint_and_raise(self, exc) -> None:
        """Save a best-effort checkpoint, then re-raise ``exc``.

        Early failures (<= 100 steps into the current run) skip the
        checkpoint to avoid persisting a likely-broken state.
        """
        # Checkpoint before exiting
        if torch.cuda.is_available():
            # Drain in-flight CUDA work so the saved state is consistent.
            torch.cuda.synchronize()
        logger.warning(f"R{self.gang.rank} checkpoint_and_raise - error={exc}")
        if self.current_run_steps > 100:
            # avoid checkpointing for early failures
            self._checkpoint(crash=exc)
        raise exc
489
+
490
+ @cached_property
491
+ def _tb_flat_config(self):
492
+ """
493
+ Prepare the flat config that will be used as HParams
494
+ to record training metadata, namely config and environment hashes.
495
+ """
496
+
497
+ dict_config = flatten_dict(asdict(self.config))
498
+
499
+ # Merge the data lists:
500
+ def get_data_signature(dataset):
501
+ return ":".join(
502
+ map(str, (dataset["name"], dataset["weight"], dataset["filters"]))
503
+ )
504
+
505
+ dict_config["training_data"] = "+".join(
506
+ get_data_signature(dataset) for dataset in dict_config["training_data"]
507
+ )
508
+ dict_config["validation_data"] = "+".join(
509
+ get_data_signature(dataset) for dataset in dict_config["validation_data"]
510
+ )
511
+
512
+ # value should be one of int, float, str, bool, or torch.Tensor
513
+ allowed_types = (int, float, str, bool, torch.Tensor)
514
+ config_keys = list(dict_config)
515
+ for k in config_keys:
516
+ if not isinstance(dict_config[k], allowed_types):
517
+ del dict_config[k]
518
+
519
+ return dict_config
520
+
521
    def run(self) -> None:
        """Run the trainer for up to `max_steps`.

        Recovers (up to configured limits) from CUDA OOM and NaN/Inf losses
        by rolling back metric updates and zeroing gradients; any other
        exception triggers a best-effort checkpoint and re-raises.
        """
        logger.info(f"Running training on {self.gang.size} device(s).")

        data_iter = self.training_data_loader.iterate_batches()

        logger.info(
            f"R{self.gang.rank} - waiting for all ranks to prepare a data iterator!"
        )
        self.gang.barrier()

        # These counters are rank-specific
        ooms, nans_or_infs = 0, 0

        # TODO: validate before training
        # logger.info(f"Starting with validation at step={self.step_nr}")
        # self._validate()

        with self.profiler:
            while self.step_nr <= self.config.max_steps:
                with record_function(f"step_{self.step_nr}"):
                    try:
                        # Main training step: forward -> backward -> optimizer.step -> log
                        stepped = self._train_step(data_iter)

                    except RuntimeError as e:
                        if "out of memory" in str(e):
                            self._log_oom(e)
                            ooms += 1
                            if self.config.raise_oom or ooms > self.config.max_ooms:
                                # Previous behaviour, no retries but still checkpointing
                                self.checkpoint_and_raise(e)

                            logger.warning(
                                f"Attempting to recover from OOM on R{self.gang.rank} (OOMS={ooms})"
                            )
                            # Treat the failed step as "done" so counters advance.
                            stepped = True
                            # reset optimizer
                            self.optimizer.zero_grad(set_to_none=True)

                            # rollback updates
                            self.train_metric_bag.rollback_updates()

                            # Empty CUDA cache before trying again
                            if torch.cuda.is_available():
                                torch.cuda.empty_cache()

                        else:
                            # Other RuntimeErrors
                            self.checkpoint_and_raise(e)

                    except FloatingPointError as e:
                        if "Losses are Nan/Inf" in str(e):
                            self._log_nan_loss(e)
                            nans_or_infs += 1
                            if (
                                self.config.raise_nan_or_inf
                                or nans_or_infs > self.config.max_nans_or_infs
                            ):
                                self.checkpoint_and_raise(e)

                            logger.warning(
                                f"Attempting to recover from NaN/Inf loss on R{self.gang.rank} (NaNs/Infs={nans_or_infs})"
                            )
                            stepped = True
                            # reset optimizer
                            self.optimizer.zero_grad(set_to_none=True)

                            # rollback updates
                            self.train_metric_bag.rollback_updates()

                        else:
                            # Other FloatingPointErrors
                            self.checkpoint_and_raise(e)

                    except Exception as e:
                        self.checkpoint_and_raise(e)

                if stepped:
                    if self._should_publish_train_metrics():
                        self._publish_train_metrics()

                    if self._should_checkpoint():
                        self._checkpoint()

                    if self._should_validate():
                        self._validate()

                    if self._should_collect_garbage():
                        self._collect_garbage()

                    self.profiler.step()

                    self.step_nr += 1
                    self.current_run_steps += 1

                else:
                    # `_train_step` returned False: the iterator is exhausted;
                    # reset the pipeline and start a new epoch.
                    logger.info(f"R{self.gang.rank} - Resetting the datapipeline")
                    self.training_data_loader.pipeline.reset()

                    logger.info(f"R{self.gang.rank} - Done resetting the datapipeline")
                    data_iter = self.training_data_loader.iterate_batches()

        self._save_model_card_for_last_checkpoint(to_checkpoint_dir=False)
        logger.info(f"Finished training after {self.step_nr - 1} step(s).")

        self.gang.close()
629
+
630
    def restore(self) -> None:
        """Load the last checkpoint and resume training from the next step."""
        logger.info("Attempting to load last checkpoint.")

        step_nr, checkpoint = self.checkpoint_manager.load_last_checkpoint()

        logger.info(f"Checkpoint loaded, restoring training from step {step_nr}.")

        self.load_state_dict(checkpoint)

        # Make sure every rank has finished restoring before continuing.
        self.gang.barrier()

        logger.info("Training restored, resuming.")

        # Resume from the step after the checkpointed one.
        self.step_nr = step_nr + 1
644
+
645
+ def _maybe_with_autocast(self) -> ContextManager[None]:
646
+ # autocast is only needed if training with mixed precision.
647
+ # If training fails without it, check if some module with its weights
648
+ # is not properly cast
649
+ if self.config.use_autocast:
650
+ return torch.autocast(device_type="cuda", dtype=self.dtype)
651
+ else:
652
+ return nullcontext()
653
+
654
    def _train_step(self, data_iter: Iterator) -> bool:
        """Run one optimizer step with gradient accumulation.

        Returns True once a step was taken; returns False when the data
        iterator is exhausted before `gradient_accumulation` batches could
        be collected. On fp16 loss-scale overflow the step is retried.
        """
        step_nr = self.step_nr

        step_stopwatch = Stopwatch(start=True, device=self.gang.device)

        stepped = False

        # We have to retry the step in case of a gradient overflow.
        while not stepped:
            batches = []

            # Collect batches.
            with record_function(f"step_{step_nr}_data_load"):
                for _ in range(self.config.gradient_accumulation):
                    try:
                        batches.append(next(data_iter))
                    except StopIteration:
                        break

            if len(batches) != self.config.gradient_accumulation:
                logger.info(
                    f"R{self.gang.rank} -End of data reached at training step {step_nr}."
                )

                return False

            # create a copy of the current metrics
            # any update to the metrics from this point will either be committed with `commit_updates`
            # or ignored with `rollback_updates`
            self.train_metric_bag.begin_updates()

            num_targets = 0

            # Accumulate gradients.
            for batch_nr, batch in enumerate(batches):
                # Skip gradient sync on all but the last accumulation batch.
                with self._maybe_no_sync(batch_nr, len(batches)):
                    with record_function(f"step_{step_nr}_{batch_nr}_forward"):
                        # autocast should wrap only the forward pass(es)
                        # of your network, including the loss computation(s).
                        # Backward passes under autocast are not recommended.
                        with self._maybe_with_autocast():
                            loss = self.criterion(batch)

                    # With loss scaling enabled, non-finite losses are handled
                    # by the scaler's overflow path instead of raising here.
                    if not (
                        torch.isfinite(loss.value).all() or self.loss_scaler.is_enabled
                    ):
                        raise FloatingPointError("Losses are Nan/Inf.")

                    # update metrics
                    self.train_metric_bag.update([loss])

                    with record_function(f"step_{step_nr}_{batch_nr}_backward"):
                        self.loss_scaler.backward(loss.value)

                    num_targets += loss.num_target_elements

            # Record and clip gradient norm
            grad_norm, raw_grad_norm = self.process_gradients(step_nr, num_targets)

            # Update parameters.
            with record_function(f"step_{step_nr}_optimizer"):
                # scale_result: LossScaleResult(old_scale: float, new_scale: float, overflow: bool, min_reached: bool)
                _, scale_result = self.loss_scaler.run_optimizer_step(step_nr)

            if scale_result.overflow:
                # Walk back the metrics update:
                self.train_metric_bag.rollback_updates()
                logger.debug(
                    f"R{self.gang.rank} rolled back update {self.train_metric_bag._original_metrics is None}"
                )

                if scale_result.min_reached:
                    logger.error(f"Loss has started exploding at step {step_nr}. Stopping training.")  # fmt: skip

                    raise FloatingPointError("The training loss has exploded.")

                logger.debug(f"Repeating training step {step_nr}.")

            else:
                self.lr_scheduler.step()

                stepped = True

            # Reset.
            self.optimizer.zero_grad(set_to_none=True)

        # Stepped = True:
        with record_function(f"step_{step_nr}_metrics"):
            self.train_metric_bag.commit_updates()

            # gradient norm is common to workers
            self.train_metric_bag.grad_norm.update(grad_norm)
            self.train_metric_bag.raw_grad_norm.update(raw_grad_norm)

        if self.gang.rank == 0:
            # update elapsed time once
            self._train_step_time += step_stopwatch.get_elapsed_time()

        del batches
        return stepped
756
+
757
+ def _maybe_no_sync(self, batch_nr: int, num_batches: int) -> ContextManager[None]:
758
+ if batch_nr < num_batches - 1 and self.gang.size > 1:
759
+ return self.model.no_sync()
760
+ return nullcontext()
761
+
762
    def normalize_gradients(self, num_targets: int) -> None:
        """
        :param num_targets:
            The number of targets used in loss computation in this process.

        If reduction = sum:
            similar to fairseq2's `normalize_gradients`, will normalize the gradients of the model by ``world_size/num_targets``.
        If reduction = mean:
            will simply multiply by world size i.e undo DDP/FSDP's default normalization
        """
        reduction = self.criterion.reduction
        if reduction == "sum":
            total_num_targets = torch.tensor(
                num_targets, device=self.gang.device, dtype=torch.int64
            )

            # Sum the per-rank target counts across all workers.
            self.gang.all_reduce(total_num_targets, ReduceOperation.SUM)

            # Both DDP and FSDP divide gradients by the world size which we also undo.
            if total_num_targets > 0:
                grad_scale = self.gang.size / total_num_targets
            else:
                # If total_num_targets == 0, gradients will be zeroes anyway
                grad_scale = self.gang.size

        else:
            grad_scale = self.gang.size

        scale_gradients(self.model, grad_scale)
791
+
792
    def process_gradients(
        self, step_nr: int, num_targets: int
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Normalize, unscale and clip the gradients.

        Returns ``(grad_norm, raw_grad_norm)``; ``raw_grad_norm`` is measured
        before any normalization/clipping and is only used for debugging.

        Raises FloatingPointError if gradient norms disagree across workers.
        """
        with record_function(f"step_{self.step_nr}_process_grads"):
            # this raw grad norm is only used for debugging
            # (max_norm=None measures the norm without clipping)
            raw_grad_norm = clip_gradient_norm(
                self.model,
                max_norm=None,
            )

            if not self.config.turn_off_grad_normalization:
                self.normalize_gradients(num_targets=num_targets)

            # undo the GradScaler's scaling before clipping
            self.loss_scaler.unscale_gradients_()

            # Clip gradients
            # If DDP, we use torch.nn.utils.clip_grad_norm_, if FSDP,
            # we use torch.distributed.fsdp.FullyShardedDataParallel.clip_grad_norm_
            # this method handles the fact that gradients might be sharded across ranks.
            grad_norm = clip_gradient_norm(
                self.model,
                max_norm=self.config.max_grad_norm,
            )

            # Check for gradient consistency across workers:
            if not check_gradient_norms(grad_norm, self.gang, step_nr):
                raise FloatingPointError(
                    f"The gradients are inconsistent between processes at step {step_nr}. Training cannot continue."
                )

        return grad_norm, raw_grad_norm
828
+
829
+ def _should_validate(self) -> bool:
830
+ return self._should_do(self.config.validate_every_n_steps)
831
+
832
+ def _should_collect_garbage(self) -> bool:
833
+ return self._should_do(self.config.gc_every_n_steps)
834
+
835
    def _collect_garbage(self):
        """Force a Python garbage-collection pass."""
        logger.info("Collecting garbage...")
        gc.collect()
838
+
839
    @torch.inference_mode()
    def _validate(self) -> None:
        """Run one full pass over the validation data and publish metrics.

        Every rank keeps stepping (on dummy batches once its own data is
        exhausted) until all ranks run out, so collective operations stay
        in sync across the gang.
        """
        gc.collect()
        torch.cuda.empty_cache()

        if self.validation_data_loader is None:
            logger.info("Skip validation as the data loader is empty")
            return

        self.model.eval()

        logger.info(f"Starting validation after step {self.step_nr}.")

        self.validation_data_loader.pipeline.reset()

        data_iter = self.validation_data_loader.iterate_batches()
        data_dummy_iter = self.validation_data_loader.iterate_dummy_batches()

        logger.info(f"R{self.gang.rank} done creating the validation data iterator")

        for step_nr in count(start=1):
            step_stopwatch = Stopwatch(start=True, device=self.gang.device)

            try:
                batch = next(data_iter)
                true_batch = 1
            except StopIteration:
                # This rank is out of data; feed a dummy batch to keep
                # collective ops in lockstep with ranks that still have data.
                batch = next(data_dummy_iter)
                true_batch = 0

            # Number of ranks that still have real data this step.
            total_nb_batches = all_sum(self.gang, true_batch)

            if bool(total_nb_batches == 0):
                break

            # we apply model for all workers to avoid process groups sync issues
            loss = self.criterion(batch)

            if true_batch:
                self._valid_step_time += step_stopwatch.get_elapsed_time()
                self.valid_metric_bag[batch.name].update([loss])

        self._publish_validation_metrics()

        logger.info(
            f"R{self.gang.rank} Validation complete in {step_nr} steps, resuming training."
        )

        self.model.train()
887
+
888
+ def _should_publish_train_metrics(self) -> bool:
889
+ return self._should_do(self.config.publish_metrics_every_n_steps)
890
+
891
+ def _set_elements_per_second(
892
+ self, metric_values: Dict[str, Any], elapsed_time: float
893
+ ) -> None:
894
+ try:
895
+ num_elements = metric_values[self.criterion.throughput_metric_name]
896
+ except KeyError:
897
+ return
898
+
899
+ if not isinstance(num_elements, (int, float, torch.Tensor)):
900
+ return
901
+
902
+ if elapsed_time == 0.0:
903
+ metric_values["elements_per_second"] = 0.0
904
+ else:
905
+ metric_values["elements_per_second"] = num_elements / elapsed_time
906
+
907
    def _publish_train_metrics(self) -> None:
        """Sync train metrics across ranks and record them on rank 0."""
        values = self.train_metric_bag.sync_and_compute_metrics()

        self.train_metric_bag.reset_non_persistent_metrics()

        # Only rank-0 to record and publish
        # since sync_and_compute_metrics's recipient rank is 0
        if self.gang.rank != 0:
            return

        assert values is not None

        values["lr"] = get_effective_lr(self.lr_scheduler)

        self._set_elements_per_second(values, self._train_step_time)

        if self.loss_scaler.is_enabled:
            values["grad_scale"] = self.loss_scaler.get_scale()

        values["wall_time"] = self.stopwatch.get_elapsed_time()
        values["elapsed_time"] = self._train_step_time

        record_metrics(self.metric_recorders, "Train", values, self.step_nr)

        # Reset the per-publish-window timer.
        self._train_step_time = 0.0
932
+
933
    def _publish_validation_metrics(self) -> None:
        """Sync per-dataset validation metrics and record them on rank 0."""
        values = {}
        for name, metric_bag in self.valid_metric_bag.items():
            values[name] = metric_bag.sync_and_compute_metrics()
            metric_bag.reset_non_persistent_metrics()

        # Only rank-0 to record and publish
        if self.gang.rank != 0:
            return

        for name, val in values.items():
            assert val is not None
            self._set_elements_per_second(val, self._valid_step_time)
            val["elapsed_time"] = self._valid_step_time
            val["wall_time"] = self.stopwatch.get_elapsed_time()
            valid_name = f"Valid | {name}"
            record_metrics(self.metric_recorders, valid_name, val, self.step_nr)

        # reset timers
        self._valid_step_time = 0.0
953
+
954
+ def _should_checkpoint(self) -> bool:
955
+ return self._should_do(self.config.checkpoint_every_n_steps)
956
+
957
+ def _should_save_consolidated_model(self) -> bool:
958
+ return self.is_fsdp and self._should_do(self.config.save_model_every_n_steps)
959
+
960
    def _checkpoint(self, crash=None) -> None:
        """Save the full trainer state via the checkpoint manager.

        :param crash: the exception that triggered this checkpoint, if any;
            stored in the checkpoint metadata.
        """
        logger.info(f"Saving checkpoint at step {self.step_nr}")
        checkpoint = self.state_dict()

        metadata = {
            "config": self.config,
            "crash": crash,
        }

        self.checkpoint_manager.begin_checkpoint(self.step_nr)

        if self.is_fsdp:
            # FSDP: each rank holds its own shard, nothing is replicated.
            replicated_keys = None
        elif self.is_ddp:
            # If we do not shard, save the model and the optimizer only on rank 0.
            replicated_keys = {"model", "optimizer"}
        else:
            replicated_keys = {"*"}

        self.checkpoint_manager.save_state(checkpoint, replicated_keys=replicated_keys)

        self.checkpoint_manager.save_metadata(metadata)

        if self._should_save_consolidated_model():
            self._save_consolidated_model()

        # Create a model card only after creating model.pt
        # i.e., regular checkpointing with DDP or after consolidation with FSDP
        if not self.is_fsdp:
            self._save_model_card_for_last_checkpoint(to_checkpoint_dir=True)

        self.checkpoint_manager.commit_checkpoint()

        # Note that this logic looks at saved directories regardless of
        # the nature of the checkpointing, consolidated or not
        if self.config.keep_last_n_checkpoints != -1:
            self.checkpoint_manager.keep_last_n_checkpoints(
                self.config.keep_last_n_checkpoints,
                preserve_model=self.config.preserve_consolidated_models,
            )

        logger.info(f"Checkpoint saved by worker @rank={self.gang.rank}")
1002
+
1003
    def _save_consolidated_model(self) -> None:
        """Consolidate the sharded FSDP weights into one model file and save its card."""
        logger.info(f"Saving consolidated model at step {self.step_nr}.")
        self.checkpoint_manager.save_consolidated_fsdp_model(self.model)
        self._save_model_card_for_last_checkpoint(to_checkpoint_dir=True)
        logger.info("Consolidated model saved.")
1008
+
1009
+ def _should_do(self, n_step: int) -> bool:
1010
+ return self.step_nr % n_step == 0
1011
+
1012
    def create_model_card_for_last_checkpoint(
        self, is_final: bool = False, **card_kwargs
    ) -> Optional[AssetCard]:
        """Create a model card based on the last saved checkpoint and the model config.

        The generic trainer cannot build a card; model-specific trainers
        override this. Always returns None here.
        """
        logger.warning(
            "Could not create a model card with a generic trainer. Please use a model-specific one."
        )
        return None
1020
+
1021
    def _save_model_card_for_last_checkpoint(
        self, to_checkpoint_dir: bool = False
    ) -> None:
        """Save the model card for the last checkpoint to the checkpoint directory or the core output directory."""
        # Only rank 0 writes the card.
        if self.gang.rank != 0:
            return

        if to_checkpoint_dir:
            # NOTE(review): relies on CheckpointManager *private* attributes;
            # the ".tmp" suffix presumably matches the in-progress checkpoint
            # directory naming — confirm against the checkpoint manager.
            current_step_nr = self.checkpoint_manager._checkpoint_step_nr
            output_dir = self.checkpoint_manager._checkpoint_dir.joinpath(
                f"step_{current_step_nr}.tmp"
            )
        else:
            output_dir = self.config.output_dir

        card = self.create_model_card_for_last_checkpoint(
            is_final=not to_checkpoint_dir
        )

        if card is not None:
            card_data = card._metadata  # TODO: use the exposed attribute when available
            with open(output_dir / "model_card.yaml", "w", encoding="utf-8") as outfile:
                yaml.dump(card_data, outfile, default_flow_style=False)
            logger.info(f"Model card saved in {output_dir}")
1045
+
1046
    def _log_oom(self, exc):
        """Log an out-of-memory error with per-device CUDA memory summaries."""
        logger.warning(
            f"OOM: Ran out of memory on R{self.gang.rank} with exception: {exc}"
        )

        if torch.cuda.is_available():
            for device_idx in range(torch.cuda.device_count()):
                logger.warning(torch.cuda.memory_summary(device=device_idx))

        sys.stderr.flush()
1056
+
1057
    def _log_nan_loss(self, exc):
        """Log a NaN/Inf loss occurrence; recovery is handled by the caller."""
        logger.warning(f"We hit a Nan/Inf Loss: raised with exception: {exc}")
1059
+
1060
+
1061
+ class TrainerBuilder:
1062
+ def __init__(self, config: TrainingConfig):
1063
+ assert config.save_model_every_n_steps % config.checkpoint_every_n_steps == 0, (
1064
+ f"save_model_every_n_steps={config.save_model_every_n_steps} for saving consolidated models should be a multiplier of checkpoint_every_n_steps={config.checkpoint_every_n_steps}"
1065
+ )
1066
+
1067
+ self.config = config
1068
+
1069
+ self.stopwatch = Stopwatch(start=True)
1070
+
1071
+ # In case we train on Ampere or later, use TF32.
1072
+ torch.set_float32_matmul_precision("high")
1073
+
1074
+ if self.config.fake_gang_device is None:
1075
+ # By default, we work with a process group
1076
+ self.gang = init_process_group(config, logger=logger._logger)
1077
+ else:
1078
+ # For testing purposes, we use a fake gang on the chosen device
1079
+ self.gang = FakeGang(device=torch.device(self.config.fake_gang_device))
1080
+
1081
+ self.gang_rank = self.gang.rank if self.gang else 0
1082
+
1083
+ if self.gang.device.type == "cuda":
1084
+ # Setup ATEN and NCCL logging if in debug mode
1085
+ self._setup_additional_logging()
1086
+
1087
+ # Dump environment variables:
1088
+ log_env_variables(self.gang.device)
1089
+
1090
+ # A variable to carry fields necessary to build concise model cards
1091
+ self.card_metdata: Dict = {}
1092
+
1093
+ if self.gang_rank == 0:
1094
+ logger.info(f"Job Config\n{pformat(config)}")
1095
+
1096
+ self.device = self.gang.device
1097
+
1098
+ rng_bag = RngBag.from_device_defaults(self.device)
1099
+
1100
+ # Ensure that each run has deterministic behavior.
1101
+ rng_bag.manual_seed(config.seed)
1102
+
1103
+ self.rng_bag = rng_bag
1104
+
1105
+ self.dtype = eval(config.dtype)
1106
+
1107
+ self.finetune: bool = False
1108
+
1109
+ self.has_checkpoint: bool = False
1110
+
1111
    @property
    @abstractmethod
    def model_loader(self):
        """A fairseq2 ModelLoader (provided by concrete builders)."""
1115
+
1116
    @property
    def model_config_loader(self):
        """A fairseq2 ConfigLoader.

        NOTE(review): accesses the model loader's private `_config_loader`.
        """
        return self.model_loader._config_loader
1120
+
1121
    @abstractmethod
    def load_data(self):
        """Load training and validation data.

        Returns one loader for training data and one for validation data.
        """
1126
+
1127
    def create_model_config(self, set_finetune_flag: bool = False):
        """
        Given `model_config_or_name`, `model_arch` and `model_arch_overrides`
        create the model config dict.
        If `set_finetune_flag` is `True` then the trainer's finetune flag will be set
        here, inferred from the use of `model_config_or_name`.

        NOTE(review): if both `model_config_or_name` and `model_arch` are None,
        `model_config`/`finetune` are never assigned and this raises
        UnboundLocalError below — consider validating the config up front.
        """
        if self.config.model_config_or_name is not None:
            assert self.config.model_arch is None, (
                "We cannot set both `model_config_or_name` and `model_arch`"
            )

            if isinstance(self.config.model_config_or_name, str):
                # The config of a registered model i.e. we're finetuning
                logger.info(
                    f"Loading pretrained model from {self.config.model_config_or_name}"
                )

                model_config = self.model_config_loader(
                    self.config.model_config_or_name
                )
                finetune = True

                # Metadata for card creation
                source_card = self.model_config_loader._asset_store.retrieve_card(
                    self.config.model_config_or_name
                )
                try:
                    arch = source_card.field("model_arch").as_(str)
                except AssetCardFieldNotFoundError:
                    arch = None

                # Record the full config only when no registered arch is known.
                self.card_metadata = {
                    "model_config": model_config if arch is None else None,
                    "model_type": model_config.model_type,
                    "model_arch": arch,
                }

            else:
                # model_config_or_name is a dataclass
                logger.info(
                    "Creating a model from the provided config in model_config_or_name"
                )
                model_config = self.config.model_config_or_name

                self.card_metadata = {
                    "model_config": model_config,
                    "model_type": model_config.model_type,
                    "model_arch": None,
                }

                finetune = False

        elif self.config.model_arch is not None:
            assert (
                self.config.model_arch in self.model_config_loader._arch_configs.names()
            ), (
                f"Could not recognise {self.config.model_arch} as a registered architecture "
            )

            logger.info(
                f"Creating a model from registered arch {self.config.model_arch}"
            )

            finetune = False
            model_config = self.model_config_loader._arch_configs.get(
                self.config.model_arch
            )
            self.card_metadata = {
                "model_config": None,
                "model_type": model_config.model_type,
                "model_arch": self.config.model_arch,
            }

        # In all setups we can override some config parameters
        if self.config.model_arch_overrides is not None:
            try:
                update_dataclass(model_config, self.config.model_arch_overrides)

            except (TypeError, ValueError) as ex:
                raise ValueError(
                    "The model_arch_overrides contain one or more invalid keys"
                ) from ex

            # An overridden config no longer matches the registered arch.
            self.card_metadata["model_arch"] = None
            self.card_metadata["model_config"] = model_config

            logger.info(
                f"Overwriting model config parameters with {self.config.model_arch_overrides}"
            )

        if set_finetune_flag:
            self.finetune = finetune

        return model_config
1222
+
1223
    def create_model(self):
        """
        Load the model to be trained.
        In case other models are developed following a different paradigm, we can create
        corresponding trainers by overriding `create_model`.
        """
        logger.info("Initializing model.")

        model_config = self.create_model_config(set_finetune_flag=True)

        if self.gang_rank == 0:
            logger.info(f"Final model config:\n{pformat(model_config)}")

        # NOTE(review): uses the loader's private `_factory` to build from config.
        model = self.model_loader._factory(
            model_config,
            device=self.device,
            dtype=self.dtype,
        )
        # log model before any wrapping:
        log_model(model, logger)

        return model
1245
+
1246
    def wrap_model_with_ddp(self, model) -> DDP:
        """Wrap the model with DDP.

        Falls back to plain PyTorch DDP when fairseq2's `to_ddp` signature
        does not match (older/newer fairseq2 versions).
        """
        try:
            ddp_model = to_ddp(
                model,
                self.gang,
            )

        except ValueError:
            logger.warning(
                "Using pytorch DDP instead of fairseq's `to_ddp`\
                - please check fairseq2 after a3de79dcc6a4ea34cde644e15b4056f1a808a6a8"
            )

            ddp_model = DDP(model)

        if self.gang_rank == 0:
            log_model(ddp_model, logger)

        return ddp_model
1267
+
1268
    def wrap_model_with_fsdp(self, model) -> FSDP:
        """Wrap the model with FSDP."""
        wrap_policy, ignored_modules = get_fsdp_wrap_policy(
            model, wrap_granularity=self.config.fsdp_wrap_granularity
        )
        memory_policy = get_fsdp_memory_policy(policy=self.config.fsdp_memory_policy)

        # Full fp32 training needs no mixed-precision policy.
        if self.dtype == torch.float32:
            mixed_precision_dtype = None
        else:
            mixed_precision_dtype = self.dtype

        skip_init = False
        # Broadcast weights from rank 0 only when finetuning without a
        # checkpoint to restore from.
        broadcast_state = self.finetune and not self.has_checkpoint
        fp32_reduce = self.config.fsdp_fp32_reduce

        if self.gang.rank == 0:
            logger.info(
                (
                    f"FSDP init with: \n--- ignored_modules={ignored_modules}"
                    f"\n--- wrap_policy={wrap_policy}"
                    f"\n--- mixed_precision_dtype={mixed_precision_dtype}"
                    f"\n--- skip_init={skip_init}"
                    f"\n--- broadcast_state (FSDP's sync_module_states)={broadcast_state}"
                    f"\n--- fp32_reduce={fp32_reduce}"
                    f"\n--- memory_policy={memory_policy}"
                )
            )

        fsdp_model = to_fsdp(
            model,
            self.gang,
            wrap_policy,
            mixed_precision_dtype=mixed_precision_dtype,
            ignored_modules=ignored_modules,
            fp32_reduce=fp32_reduce,
            skip_init=skip_init,
            broadcast_state=broadcast_state,
            memory_policy=memory_policy,
        )

        if self.gang_rank == 0:
            log_model(fsdp_model, logger)

        return fsdp_model
1314
+
1315
    def maybe_load_model(self, model):
        """
        If we are finetuning and we don't have a checkpoint,
        load the pre-trained model and broadcast it to
        all gang processes from rank 0.
        """
        if not self.has_checkpoint and self.finetune:
            logger.info(f"Loading for finetuning: {self.config.model_config_or_name}")

            if self.gang_rank == 0:
                pretrained_model = self.model_loader(
                    model_name_or_card=self.config.model_config_or_name,
                    device=self.gang.device,
                    dtype=self.dtype,
                )  # type: ignore[arg-type]

                try:
                    model.load_state_dict(
                        pretrained_model.state_dict(),
                        strict=True,
                        assign=False,
                    )
                except (KeyError, ValueError) as ex:
                    raise ValueError(
                        f"The model state form {self.config.model_config_or_name} "
                        "cannot be loaded. See nested exception for details."
                    ) from ex

            # All ranks wait for rank 0 to finish loading.
            self.gang.barrier()

            to_device(model, self.gang.device)

            logger.info(
                f"Done loading model for finetuning: {self.config.model_config_or_name}"
            )

        return model
1352
+
1353
+ def maybe_freeze_parameters(self, model):
1354
+ assert (self.config.freezing_strategy == "modules") == (
1355
+ self.config.freeze_modules is not None
1356
+ ), (
1357
+ "For the `modules` freezing_strategy, we need a list of `freeze_modules`. "
1358
+ "If `freeze_modules` is provided, make sure to use freezing_strategy=modules"
1359
+ )
1360
+
1361
+ if self.config.freezing_strategy == "none":
1362
+ return model
1363
+
1364
+ if self.config.freezing_strategy == "modules":
1365
+ # Optionally freeze the parameters of sub-modules:
1366
+ if self.config.freeze_modules is not None:
1367
+ for module in self.config.freeze_modules:
1368
+ logger.info(f"... Freezing module={module}")
1369
+ freeze_parameters(getattr(model, module))
1370
+ return model
1371
+
1372
+ if self.config.freezing_strategy == "ffn":
1373
+ for name, m in _get_named_modules(model):
1374
+ if "ffn" in name:
1375
+ logger.info(f"... Freezing module={name}")
1376
+ freeze_parameters(m)
1377
+ return model
1378
+
1379
+ if self.config.freezing_strategy == "adaln":
1380
+ for name, m in _get_named_modules(model):
1381
+ if "modulator" in name:
1382
+ logger.info(f"... Freezing module={name}")
1383
+ freeze_parameters(m)
1384
+ return model
1385
+
1386
+ if self.config.freezing_strategy == "ffn-adaln":
1387
+ for name, m in _get_named_modules(model):
1388
+ if "modulator" in name or "ffn" in name:
1389
+ logger.info(f"... Freezing module={name}")
1390
+ freeze_parameters(m)
1391
+ return model
1392
+
1393
+ raise ValueError(f"Unknown freezing stratgey {self.config.freezing_strategy}")
1394
+
1395
    def _setup_additional_logging(self):
        """Enable extra (ATEN/NCCL) logging when running in debug mode."""
        if self.config.debug:
            assert self.config.log_folder is not None, (
                "Missing log_folder, \
                make sure the log_folder is properly set in the training config"
            )
            setup_additional_logging(log_folder=self.config.log_folder)
1402
+
1403
    @property
    def use_fsdp(self) -> bool:
        """Whether FSDP sharding is requested in the config."""
        return self.config.use_fsdp
1406
+
1407
    @property
    def use_ddp(self) -> bool:
        """
        Whether DDP should be used.
        if self.gang.size == 1: single worker, no parallelism
        if use_fsdp: use FSDP instead
        """
        return not (self.gang.size == 1 or self.use_fsdp)
1415
+
1416
+ @abstractmethod
1417
+ def build_trainer(self):
1418
+ """Build the trainer by loading data and
1419
+ setting up the model for training
1420
+
1421
+ Returns trainer
1422
+ """
lcm/train/two_tower_diffusion_lcm/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
lcm/train/two_tower_diffusion_lcm/criterion.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import List, Tuple
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from fairseq2.logging import get_log_writer
12
+ from fairseq2.nn.padding import pad_seqs
13
+ from torch import Tensor
14
+
15
+ from lcm.datasets.batch import EmbeddingsBatch, LCMInput, LCMStyle
16
+ from lcm.models.two_tower_diffusion_lcm.builder import TwoTowerDiffusionLCModel
17
+ from lcm.train.criterion import CriterionsFactory
18
+ from lcm.train.lcm.criterion import (
19
+ LCMCriterion,
20
+ LCMCriterionConfig,
21
+ compute_standard_mse,
22
+ )
23
+ from lcm.train.metrics import LossTerm, format_as_float, register_metric_formatter
24
+ from lcm.train.step_sampler import StepsSampler, StepsSamplerConfig
25
+
26
+ logger = get_log_writer(__name__)
27
+
28
+
29
@dataclass
class TowerDiffusionLCMCriterionConfig(LCMCriterionConfig):
    """Configuration for the two-tower diffusion LCM training criterion."""

    cf_guidance_probability: float = 0.0
    """Probability to use classifier-free guidance by dropping conditioning.
    Note that this requires the model to be set with
    `trained_with_cf_guidance = True`!
    """
    # How diffusion timesteps are sampled (and how per-step losses are
    # re-weighted) during training.
    step_sampling: StepsSamplerConfig = field(
        default_factory=lambda: StepsSamplerConfig()
    )

    # When True, additionally report reconstruction losses aggregated per
    # bucket of diffusion timesteps (see TwoTowerDiffusionCriterion).
    log_losses_per_timestep_bucket: bool = False
41
+
42
+
43
@CriterionsFactory.register("two_tower_diffusion_next_sent")
class TwoTowerDiffusionCriterion(LCMCriterion):
    """Computes the LCM training objective for next-sentence prediction with diffusion"""

    config: TowerDiffusionLCMCriterionConfig
    model: TwoTowerDiffusionLCModel

    def __init__(
        self,
        config: TowerDiffusionLCMCriterionConfig,
        model: TwoTowerDiffusionLCModel,
        style: LCMStyle = LCMStyle.UNSUPERVISED,
    ):
        """
        :param config: criterion configuration (guidance probability, step sampling, ...)
        :param model: the two-tower diffusion LCM being trained
        :param style: unsupervised (single stream) or supervised (src/tgt) batches
        """
        super().__init__(config, model, style)
        assert hasattr(self.base_model, "noise_scheduler"), (
            "Expecting the diffusion model to have a `noise_scheduler`"
        )
        self.noise_scheduler = self.base_model.noise_scheduler

        # What the denoiser predicts: "sample", "epsilon" or "v_prediction"
        self.prediction_type = self.noise_scheduler.prediction_type

        self.trained_with_cf_guidance = self.base_model.config.trained_with_cf_guidance

        self.cf_guidance_probability = config.cf_guidance_probability

        # A non-zero conditioning-dropout probability is only meaningful (and
        # only allowed) if the model was built for classifier-free guidance.
        # NOTE(review): the assert message below is a tuple (always truthy as a
        # message) and contains the typo "cf_guidance_probabilitya" — left
        # unchanged here; worth fixing separately.
        assert (
            bool(self.cf_guidance_probability > 0) == self.trained_with_cf_guidance
        ), (
            "Expecting the config's cf_guidance_probabilitya to align with the model's `trained_with_cf_guidance` ",
            f"Found cf_guidance_probability={config.cf_guidance_probability} and "
            f"trained_with_cf_guidance={self.trained_with_cf_guidance}",
        )

        assert self.normalize_in_criterion, (
            "We only support `normalize_in_criterion = True` in the diffusion criterions"
        )

        self.summands.append("unnormalized_reconstruction_loss")

        if self.config.log_losses_per_timestep_bucket:
            # customize if needed
            # 10 equal-width buckets over [0, num_diffusion_train_steps]
            self.step_bucketing_boundaries = torch.linspace(
                0, self.noise_scheduler.num_diffusion_train_steps, 11
            )
            self.step_bucketing_labels: List[str] = []
            for e in range(len(self.step_bucketing_boundaries) - 1):
                bucket_left = self.step_bucketing_boundaries[e]
                bucket_right = self.step_bucketing_boundaries[e + 1]
                self.step_bucketing_labels.append(
                    f"reconstruction_loss_t{bucket_left:.0f}-{bucket_right:.0f}"
                )

            self.summands.extend(self.step_bucketing_labels)
            for label in self.step_bucketing_labels:
                register_metric_formatter(
                    label, label, 1000, format_as_float, overwrite=True
                )

        # Step sampler + loss weighter
        self.step_sampler = StepsSampler(
            config.step_sampling,
            noise_scheduler=self.noise_scheduler,
        )

    def prepare_input_and_mask(
        self,
        batch: LCMInput,
    ) -> Tuple[EmbeddingsBatch, EmbeddingsBatch, torch.Tensor]:
        """
        A method for preparing model inputs and mask for a batch.
        It will be typically reused by the `__call__`
        implementations of the subclasses.
        Returns:
            - input_batch: context
            - target_batch: denoiser input
            - target_mask mask of positions to compute the loss over

        """
        # Prepare the input as in MSE LCM: each sequence is (src, tgt)
        input_embeddings = batch.prepare_input(style=self.style)

        # Normalize the embeddings
        if self.normalize_in_criterion:
            input_embeddings = input_embeddings.normalize_seqs(self.sonar_normalizer)

        # Start from an all-True mask: every position is a potential target.
        target_mask = torch.ones(
            size=input_embeddings.seqs.shape[:-1],
            dtype=torch.bool,
            device=input_embeddings.seqs.device,
        )

        # Factor in padded positions:
        if input_embeddings.padding_mask is not None:
            target_mask &= input_embeddings.padding_mask.materialize()

        # Unsupervised case: context and denoiser input are the same sequence;
        # clone so the noising step does not alias the context tensor.
        return input_embeddings, input_embeddings.clone(), target_mask

    def sample_noisy_input_and_targets(self, input_batch, target_mask):
        """
        (1)
        Prepares the noised inputs (latents) by sampling diffusion timesteps and calling
        on the model's noise_scheduler to add noise accordingly
        (2) Given the scheduler prediction type, prepares the target that the model will be
        trained to predict.

        :param input_batch: EmbeddingsBatch of the ground truth embeddings with seqs in (B, T, C)
        :param target_mask: Bool tensor in (B, T) where `True` signals that the
            model will be asked to predict the position
        """
        input_seqs, padding_mask = input_batch.seqs, input_batch.padding_mask

        # One diffusion timestep per (batch, position) element, shape (B, T).
        timesteps = self.step_sampler.sample(
            size=input_seqs[..., 0].size(), device=input_seqs.device
        )

        # Sample noise
        noise_seqs = torch.randn_like(input_seqs)

        # Define target in (B*T, C)
        sonar_dim = input_seqs.size(-1)
        if self.prediction_type == "sample":
            """Predict the clean ground truth embeddings. Default mode"""
            target = input_seqs.view(-1, sonar_dim)

        elif self.prediction_type == "epsilon":
            """Predict the added noise"""
            target = noise_seqs.view(-1, sonar_dim)

        elif self.prediction_type == "v_prediction":
            """Predict an interpolation of the ground truth clean
            embeddings and the added noise.
            As introduced in https://arxiv.org/pdf/2305.08891
            """
            target = self.noise_scheduler.get_velocity(
                input_seqs.view(-1, sonar_dim),
                noise_seqs.view(-1, sonar_dim),
                timesteps.view(-1),
            ).clone()
        else:
            raise ValueError(
                "Prediction type should be either: sample, epsilon, v_prediction"
            )

        # Add noise
        # Reshape inputs and noise into in (B*T , C) -> add noise -> reshape back as (B, T, C)
        noisy_input_seqs = self.noise_scheduler.add_noise(
            input_seqs.view(-1, sonar_dim),
            noise_seqs.view(-1, sonar_dim),
            timesteps.view(-1),
        ).view(input_seqs.size())

        # Create sequence batch with diffusion timesteps
        noisy_input_batch = EmbeddingsBatch(
            noisy_input_seqs,
            padding_mask,
            diffusion_timesteps=timesteps,
        )
        return noisy_input_batch, target, target_mask

    def compute_loss(
        self, flattened_predictions, flattened_target
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Parameters:
            flattened_predictions (Tensor): The predictions in (N, C)
            flattened_target (Tensor): The targets in (N, C)

        Returns:
            reconstruction_loss (Tensor): The Reconstruction loss we want to optimize (RMSE, SmoothL1, Huber etc.).
            plain_reconstruction_loss (Tensor): plain RMSE loss.
            unnormalized_reconstruction_loss (Tensor): plain RMSE loss between unnormalized features.
        """
        reconstruction_loss, plain_reconstruction_loss = compute_standard_mse(
            flattened_predictions,
            flattened_target,
        )

        # Same loss but measured in the original (un-normalized) SONAR space,
        # for monitoring only.
        unnormalized_reconstruction_loss, _ = compute_standard_mse(
            flattened_predictions,
            flattened_target,
            normalizer=self.sonar_normalizer,
        )
        # For backward compatibility with ongoing runs, take the sqrt
        if self.config.compute_rmse:
            # epsilon keeps sqrt differentiable/finite at 0
            epsilon = 1e-5
            reconstruction_loss = torch.sqrt(reconstruction_loss + epsilon)
            plain_reconstruction_loss = torch.sqrt(plain_reconstruction_loss + epsilon)
            unnormalized_reconstruction_loss = torch.sqrt(
                unnormalized_reconstruction_loss + epsilon
            )

        return (
            reconstruction_loss,
            plain_reconstruction_loss,
            unnormalized_reconstruction_loss,
        )

    @torch.no_grad()
    def _log_losses_per_step(self, batch_steps, reconstruction_loss):
        """Return per-timestep-bucket loss summands for metric tracking.

        :param batch_steps: diffusion timesteps, flattened to (B*T,)
        :param reconstruction_loss: per-element losses, flattened to (B*T,)
        """
        # Aggregate loss terms based on their bucket of diffusion steps for tracking
        summands = {}
        if self.config.log_losses_per_timestep_bucket:
            # Reconstruction_loss in BT,
            # batch_steps in BT,
            bucket_index = torch.bucketize(
                batch_steps, self.step_bucketing_boundaries.to(batch_steps.device)
            )
            onehot = F.one_hot(
                bucket_index,
                num_classes=self.step_bucketing_boundaries.numel(),
            )
            # (num_buckets, BT) @ (BT,) -> per-bucket loss sums
            loss_per_step = torch.matmul(onehot.t().float(), reconstruction_loss)
            # small epsilon avoids division by zero for empty buckets
            count_steps = onehot.sum(dim=0) + 1e-6
            if self.reduction == "mean":
                loss_per_step /= count_steps

            for e, label in enumerate(self.step_bucketing_labels):
                summands[label] = (
                    loss_per_step[e].item(),
                    count_steps[e].long().item(),
                )

        return summands

    def __call__(self, batch: LCMInput) -> LossTerm:
        """
        Input batch is LCMInput with:
            source: List[Tensor]
            target: Union[None, List[Tensor]]
        """

        # Prepare the clean inputs and target mask:
        input_batch, target_batch, target_mask = self.prepare_input_and_mask(batch)

        noisy_target_batch, target, target_mask = self.sample_noisy_input_and_targets(
            target_batch, target_mask
        )
        # Encode the context and diffuse:
        output_batch = self.model(
            input_batch,
            noisy_target_batch,
            cf_guidance_prob=self.cf_guidance_probability,
        )

        # Shape B, T, C
        output_seqs = output_batch.seqs

        sonar_dim = output_seqs.size(-1)

        # only measure distance over `target_mask = True` positions
        target_mask = target_mask.reshape(-1)

        # The target is basically the doubled ground truth sequence before noising
        # (with some modification to adjust for the denoiser's prediction type)

        # contextualized latents (noised inputs preceding the target) e_1, e_2, ...
        flattened_predictions = output_seqs.view(-1, sonar_dim)[target_mask]

        # x1, x2, ..., xT
        # Target is already in B*T, C
        flattened_target = target[target_mask]

        # Cast features to float32 before computing the loss:
        (
            reconstruction_loss,
            mse_loss,
            unnormalized_reconstruction_loss,
        ) = self.compute_loss(flattened_predictions.float(), flattened_target.float())

        num_target_elements = target_mask.sum()

        batch_steps = noisy_target_batch.diffusion_timesteps.view(-1)[target_mask]

        summands = self._log_losses_per_step(batch_steps, reconstruction_loss)

        # Get loss scales per timestep (gamma)
        gammas = self.step_sampler.get_loss_scales(batch_steps)
        # Weight the loss terms
        if gammas is not None:
            reconstruction_loss = torch.mul(reconstruction_loss, gammas)

        # When there are no target positions, fall back to sum() so the
        # reduction yields 0 instead of a NaN from mean() over an empty tensor.
        if self.reduction == "sum" or num_target_elements == 0:
            reduced_reconstruction_loss = reconstruction_loss.sum()
            mse_loss = mse_loss.sum()
            unnormalized_reconstruction_loss = unnormalized_reconstruction_loss.sum()

        elif self.reduction == "mean":
            reduced_reconstruction_loss = reconstruction_loss.mean()
            mse_loss = mse_loss.mean()
            unnormalized_reconstruction_loss = unnormalized_reconstruction_loss.mean()

        final_loss = reduced_reconstruction_loss

        # Loss summands for records
        summands.update(
            {
                "mse_loss": (mse_loss.item(), -1),
                "reconstruction_loss": (reduced_reconstruction_loss.item(), -1),
                "unnormalized_reconstruction_loss": (
                    unnormalized_reconstruction_loss.item(),
                    -1,
                ),
            }
        )

        return LossTerm(
            value=final_loss,
            batch_size=output_seqs.size(0),
            num_target_elements=num_target_elements.item(),
            summands=summands,
        )
354
+
355
+
356
@CriterionsFactory.register("two_tower_diffusion_next_sent_finetuning")
class DiffusionNextSentFinetuningCriterion(TwoTowerDiffusionCriterion):
    """Supervised (source -> target) finetuning variant of the two-tower
    diffusion criterion: the context is the source document and the loss is
    computed over the separately padded target sentences only."""

    def __init__(
        self,
        config: TowerDiffusionLCMCriterionConfig,
        model: TwoTowerDiffusionLCModel,
    ):
        super().__init__(config, model, LCMStyle.SUPERVISED)

    def prepare_input_and_mask(
        self,
        batch: LCMInput,
    ) -> Tuple[EmbeddingsBatch, EmbeddingsBatch, torch.Tensor]:
        """
        A method for preparing model inputs and mask for a batch.
        It will be typically reused by the `__call__`
        implementations of the subclasses.

        Returns:
            - input_batch: context
            - target_batch: denoiser input
            - target_mask mask of positions to compute the loss over
        """

        # Prepare the input as in MSE LCM
        input_embeddings = batch.prepare_input(style=self.style)

        assert input_embeddings.source_lengths is not None, (
            "Missing source lengths needed for the two-tower supervised finetuning"
        )

        # Pad the target sentence embeddings into their own batch.
        target_embeddings = EmbeddingsBatch(*pad_seqs(batch.target))  # type: ignore

        # Normalize the embeddings
        if self.normalize_in_criterion:
            input_embeddings = input_embeddings.normalize_seqs(self.sonar_normalizer)
            target_embeddings = target_embeddings.normalize_seqs(self.sonar_normalizer)

        # FIX: read the shape from the wrapped tensor (`.seqs.shape`), matching
        # the base-class implementation; `EmbeddingsBatch` itself has no `shape`.
        target_mask = torch.ones(
            size=target_embeddings.seqs.shape[:-1],
            dtype=torch.bool,
            device=input_embeddings.seqs.device,
        )

        # Factor in padded positions:
        if target_embeddings.padding_mask is not None:
            target_mask &= target_embeddings.padding_mask.materialize()

        return input_embeddings, target_embeddings, target_mask
lcm/train/two_tower_diffusion_lcm/trainer.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ #
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import Union
8
+
9
+ from lcm.models.two_tower_diffusion_lcm.builder import TwoTowerDiffusionLCModelConfig
10
+ from lcm.models.two_tower_diffusion_lcm.loader import (
11
+ load_two_tower_diffusion_lcm_model,
12
+ )
13
+ from lcm.train.lcm.trainer import LCMTrainer, LCMTrainerBuilder, LCMTrainingConfig
14
+ from lcm.train.two_tower_diffusion_lcm.criterion import (
15
+ TowerDiffusionLCMCriterionConfig,
16
+ )
17
+
18
+
19
@dataclass
class TwoTowerDiffusionLCMTrainingConfig(LCMTrainingConfig):
    """Training configuration specialized for two-tower diffusion LCMs."""

    model_config_or_name: Union[TwoTowerDiffusionLCModelConfig, str, None] = None
    """The model configuration or name to train."""

    # Use the diffusion-specific criterion config by default.
    criterion: TowerDiffusionLCMCriterionConfig = field(  # type: ignore
        default_factory=lambda: TowerDiffusionLCMCriterionConfig()
    )
27
+
28
+
29
class DiffusionLCMTrainerBuilder(LCMTrainerBuilder):
    """Trainer builder specialized for two-tower diffusion LCMs.

    Only overrides the model loader; everything else is inherited from
    :class:`LCMTrainerBuilder`.
    """

    config: TwoTowerDiffusionLCMTrainingConfig

    def __init__(self, config: TwoTowerDiffusionLCMTrainingConfig):
        super().__init__(config)

    @property
    def model_loader(self):
        """A fairseq2 ModelLoader for two-tower diffusion LCM checkpoints."""
        loader = load_two_tower_diffusion_lcm_model
        return loader
39
+
40
+
41
+ def prepare_two_tower_diffusion_lcm_trainer(
42
+ config: TwoTowerDiffusionLCMTrainingConfig,
43
+ ) -> LCMTrainer:
44
+ """Create an LCM Trainer.
45
+ :param config: The training configuration.
46
+ """
47
+ return DiffusionLCMTrainerBuilder(config).build_trainer()
pyproject.toml CHANGED
@@ -13,6 +13,7 @@ dependencies = [
13
  "polars>=1.16.0",
14
  "pyarrow>=16.1.0",
15
  "retrying>=1.3.4",
 
16
  "sentence-splitter>=1.4",
17
  "sonar-space>=0.3.2",
18
  "stopes[mono]>=2.2.0",
 
13
  "polars>=1.16.0",
14
  "pyarrow>=16.1.0",
15
  "retrying>=1.3.4",
16
+ "safetensors>=0.5.3",
17
  "sentence-splitter>=1.4",
18
  "sonar-space>=0.3.2",
19
  "stopes[mono]>=2.2.0",
scripts/CovertToST.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from safetensors.torch import save_file
3
+ import os
4
+
5
+ # Define the location and files to process
6
+ location = "_LexaLCM_Pre0/Checkpoints/LCM_TwoTower_Pre0/checkpoints/step_250000"
7
+ files = ["model", "rank_0", "metadata"]
8
+
9
+ for file in files:
10
+ pt_path = os.path.join(location, f"{file}.pt")
11
+ st_path = os.path.join(location, f"{file}.safetensors")
12
+
13
+ try:
14
+ # Attempt to load the checkpoint with weights_only=True
15
+ checkpoint = torch.load(pt_path, weights_only=True)
16
+ except Exception as e:
17
+ print(f"Warning: Failed to load {pt_path} with weights_only=True due to {e}")
18
+ print("Attempting to load with weights_only=False (ensure the source is trusted).")
19
+ try:
20
+ checkpoint = torch.load(pt_path, weights_only=False)
21
+ except Exception as e:
22
+ print(f"Error: Failed to load {pt_path} with weights_only=False due to {e}")
23
+ continue # Skip to the next file
24
+
25
+ # Determine the state_dict
26
+ state_dict = checkpoint.get('model', checkpoint)
27
+
28
+ # Filter out non-tensor entries
29
+ tensor_state_dict = {k: v for k, v in state_dict.items() if isinstance(v, torch.Tensor)}
30
+
31
+ # Save the filtered state_dict to a .safetensors file
32
+ save_file(tensor_state_dict, st_path)
33
+ print(f"Successfully converted {pt_path} to {st_path}")