pbcong commited on
Commit
19ed37d
·
verified ·
1 Parent(s): 0812c72

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +1 -0
  2. config.json +61 -0
  3. configuration_sedd.py +122 -0
  4. pytorch_model.bin +3 -0
  5. sedd_wrapper.py +289 -0
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ Score Entropy Discrete Diffusion (SEDD) medium model, for use with the inference code at https://github.com/louaaron/Score-Entropy-Discrete-Diffusion. The accompanying paper is available at https://arxiv.org/abs/2310.16834.
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ngpus": 8,
3
+ "tokens": 50257,
4
+ "training": {
5
+ "batch_size": 512,
6
+ "accum": 2,
7
+ "n_iters": 1300001,
8
+ "snapshot_freq": 50000,
9
+ "log_freq": 50,
10
+ "eval_freq": 100,
11
+ "snapshot_freq_for_preemption": 10000,
12
+ "weight": "standard",
13
+ "snapshot_sampling": true,
14
+ "ema": 0.9999
15
+ },
16
+ "data": {
17
+ "train": "openwebtext",
18
+ "valid": "wikitext103",
19
+ "cache_dir": "data"
20
+ },
21
+ "graph": {
22
+ "type": "absorb"
23
+ },
24
+ "noise": {
25
+ "type": "loglinear",
26
+ "sigma_min": 0.0001,
27
+ "sigma_max": 20
28
+ },
29
+ "sampling": {
30
+ "predictor": "euler",
31
+ "steps": 128,
32
+ "noise_removal": true
33
+ },
34
+ "eval": {
35
+ "batch_size": 512,
36
+ "perplexity": true,
37
+ "perplexity_batch_size": 32
38
+ },
39
+ "optim": {
40
+ "weight_decay": 0,
41
+ "optimizer": "AdamW",
42
+ "lr": 0.0003,
43
+ "beta1": 0.9,
44
+ "beta2": 0.999,
45
+ "eps": 1e-08,
46
+ "warmup": 2500,
47
+ "grad_clip": 1.0
48
+ },
49
+ "model": {
50
+ "name": "medium",
51
+ "type": "ddit",
52
+ "hidden_size": 1024,
53
+ "cond_dim": 128,
54
+ "length": 1024,
55
+ "n_blocks": 24,
56
+ "n_heads": 16,
57
+ "scale_by_sigma": true,
58
+ "dropout": 0.1
59
+ },
60
+ "work_dir": "absorb_medium"
61
+ }
configuration_sedd.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """configuration_sedd.py
4
+ ====================================
5
+ HuggingFace *Transformers* configuration class for the `SEDD` architecture.
6
+
7
+ This mirrors the structure of other community models in 🤗 Transformers so that
8
+ `AutoConfig` can correctly instantiate the model.
9
+
10
+ The default values roughly reproduce the "small" setup shipped in
11
+ `configs/model/small.yaml` of this repository.
12
+ """
13
+
14
+ from typing import Any, Dict
15
+
16
+ from transformers.configuration_utils import PretrainedConfig
17
+
18
+ try:
19
+ # `omegaconf` is an explicit dependency of the original SEDD implementation.
20
+ from omegaconf import OmegaConf # type: ignore
21
+ except ImportError: # pragma: no cover – users might wish to load a config without installing omegaconf
22
+ OmegaConf = None # type: ignore
23
+
24
+ __all__ = [
25
+ "SEDDConfig",
26
+ ]
27
+
28
+
29
class SEDDConfig(PretrainedConfig):
    """Configuration class for the SEDD score-based model.

    The defaults approximate the "small" setup shipped in
    ``configs/model/small.yaml`` of the reference repository.

    Parameters
    ----------
    tokens:
        Tokenizer vocabulary size (default: 50257 – GPT-2 vocab).
    graph_type:
        Token-graph variant; ``"absorb"`` matches the reference implementation.
    model_hidden_size:
        Transformer hidden-state dimension.
    model_cond_dim:
        Dimension of the noise-level conditioning embedding.
    model_length:
        Fixed maximum sequence length the model was trained with.
    model_n_blocks:
        Number of *DDiT* blocks in the network.
    model_n_heads:
        Attention heads per *DDiT* block.
    model_scale_by_sigma:
        Whether output logits are scaled by the noise level (see
        ``SEDD.forward``).
    model_dropout:
        Dropout probability used throughout the network.
    tie_word_embeddings:
        Not used by SEDD itself, but forwarded to the base class so the flag
        is serialised into the resulting JSON file.
    """

    model_type: str = "sedd"

    def __init__(
        self,
        *,
        tokens: int = 50257,
        # Graph section
        graph_type: str = "absorb",
        # Model section
        model_hidden_size: int = 768,
        model_cond_dim: int = 128,
        model_length: int = 1024,
        model_n_blocks: int = 12,
        model_n_heads: int = 12,
        model_scale_by_sigma: bool = True,
        model_dropout: float = 0.10,
        # Miscellaneous / HF specific
        tie_word_embeddings: bool = False,
        **kwargs,
    ) -> None:
        # `tie_word_embeddings` belongs to the base class, which validates its
        # keyword-only signature.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # Attributes are kept *flat*, matching the style of most HF configs.
        self.tokens = tokens
        self.graph_type = graph_type
        self.model_hidden_size = model_hidden_size
        self.model_cond_dim = model_cond_dim
        self.model_length = model_length
        self.model_n_blocks = model_n_blocks
        self.model_n_heads = model_n_heads
        self.model_scale_by_sigma = model_scale_by_sigma
        self.model_dropout = model_dropout

    # ------------------------------------------------------------------
    # Compatibility helpers
    # ------------------------------------------------------------------

    def to_hydra(self):
        """Build the nested OmegaConf structure expected by the reference
        ``SEDD`` implementation from this *flat* configuration.
        """
        if OmegaConf is None:
            raise RuntimeError("`omegaconf` is required to build a Hydra config")

        model_section: Dict[str, Any] = {
            "hidden_size": self.model_hidden_size,
            "cond_dim": self.model_cond_dim,
            "length": self.model_length,
            "n_blocks": self.model_n_blocks,
            "n_heads": self.model_n_heads,
            "scale_by_sigma": self.model_scale_by_sigma,
            "dropout": self.model_dropout,
        }
        return OmegaConf.create(
            {
                "tokens": self.tokens,
                "graph": {"type": self.graph_type},
                "model": model_section,
            }
        )
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d93bb0dd1013295a4865848ea546ee3763a5be036cf55ea407e898c0a7a82a33
3
+ size 1698000441
sedd_wrapper.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """sedd_wrapper.py
4
+ =========================================
5
+ This module provides a minimal HuggingFace-compatible wrapper around the
6
+ `SEDD` architecture that is implemented in :pyfile:`model/transformer.py`.
7
+
8
+ The wrapper closely follows the design used in the Aero implementation that
9
+ lives in this code-base (see :pyfile:`configuration_aero.py` and
10
+ :pyfile:`modeling_aero.py`). Concretely we expose three public objects:
11
+
12
+ * ``SEDDConfig`` A :class:`transformers.PretrainedConfig` subclass that
13
+ stores the hyper-parameters needed to instantiate a ``SEDD`` model.
14
+ * ``SEDDModel`` A :class:`transformers.PreTrainedModel` subclass that
15
+ internally contains an instance of the original ``SEDD`` network and maps
16
+ from ``input_ids`` + ``sigma`` to the vocabulary logits.
17
+ * ``SEDDOutput`` A thin :class:`transformers.modeling_outputs.ModelOutput`
18
+ dataclass that mirrors the usual "logits / loss" structure.
19
+
20
+ With this wrapper a trained model checkpoint can be pushed to / loaded from
21
+ 🤗 Hub via ``SEDDModel.push_to_hub`` / ``SEDDModel.from_pretrained`` the same
22
+ way as any other ``transformers`` model.
23
+ """
24
+
25
+ from dataclasses import dataclass
26
+ from typing import Optional, Tuple, List, Dict, Any, Union
27
+
28
+ import torch
29
+ from torch import nn
30
+ from transformers.configuration_utils import PretrainedConfig
31
+ from transformers.modeling_outputs import ModelOutput
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.utils import logging
34
+
35
+ # Original SEDD implementation
36
+ from model.transformer import SEDD as _OrigSEDD
37
+
38
+ try:
39
+ from omegaconf import OmegaConf
40
+ except ImportError: # pragma: no cover – omegaconf is an explicit dependency of SEDD
41
+ OmegaConf = None # type: ignore
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+ ###############################################################################
46
+ # Configuration #
47
+ ###############################################################################
48
+
49
+
50
class SEDDConfig(PretrainedConfig):
    """Configuration class for the SEDD architecture.

    The defaults *roughly* reproduce the "small" configuration shipped in
    ``configs/model/small.yaml``. Keys present in the original Hydra config
    but not needed for instantiation (e.g. *training* hyper-parameters) are
    deliberately omitted; users may still persist them as *extra* fields in
    the underlying JSON if desired.
    """

    model_type: str = "sedd"

    def __init__(
        self,
        *,
        tokens: int = 50257,
        # graph section
        graph_type: str = "absorb",
        # model section (mirrors configs/model/*.yaml)
        model_hidden_size: int = 768,
        model_cond_dim: int = 128,
        model_length: int = 1024,
        model_n_blocks: int = 12,
        model_n_heads: int = 12,
        model_scale_by_sigma: bool = True,
        model_dropout: float = 0.10,
        # miscellaneous
        tie_word_embeddings: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # Flat top-level attributes, for simplicity.
        self.tokens = tokens
        self.graph_type = graph_type

        # Model hyper-parameters.
        self.model_hidden_size = model_hidden_size
        self.model_cond_dim = model_cond_dim
        self.model_length = model_length
        self.model_n_blocks = model_n_blocks
        self.model_n_heads = model_n_heads
        self.model_scale_by_sigma = model_scale_by_sigma
        self.model_dropout = model_dropout

    # ---------------------------------------------------------------------
    # Serialization helpers – bridge to the original Hydra config structure
    # that the reference implementation expects.
    # ---------------------------------------------------------------------

    def to_hydra(self):
        """Convert this *flat* config into the nested OmegaConf structure
        consumed by the reference ``SEDD`` implementation.
        """
        if OmegaConf is None:
            raise RuntimeError("`omegaconf` is required to build a Hydra config")

        model_section: Dict[str, Any] = {
            "hidden_size": self.model_hidden_size,
            "cond_dim": self.model_cond_dim,
            "length": self.model_length,
            "n_blocks": self.model_n_blocks,
            "n_heads": self.model_n_heads,
            "scale_by_sigma": self.model_scale_by_sigma,
            "dropout": self.model_dropout,
        }
        return OmegaConf.create(
            {
                "tokens": self.tokens,
                "graph": {"type": self.graph_type},
                "model": model_section,
            }
        )
124
+
125
+ ###############################################################################
126
+ # Output container #
127
+ ###############################################################################
128
+
129
+
130
@dataclass
class SEDDOutput(ModelOutput):
    """Standard output container for :class:`SEDDModel`.

    Attributes
    ----------
    loss:
        Optional scalar, present only when ``labels`` were provided.
    logits:
        Raw vocabulary logits of shape
        ``(batch_size, sequence_length, vocab_size)``.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
145
+
146
+ ###############################################################################
147
+ # Model #
148
+ ###############################################################################
149
+
150
+
151
class SEDDModel(PreTrainedModel):
    """HuggingFace *Transformers* wrapper around the original ``SEDD`` model.

    The wrapped network lives in ``self.score_model``; this class maps the
    HF-style ``(input_ids, sigma)`` call convention onto it, optionally
    computes a cross-entropy loss, and knows how to load legacy SEDD
    checkpoints.
    """

    config_class = SEDDConfig
    base_model_prefix = "score_model"
    _no_split_modules: List[str] = [
        "DDiTBlock",  # ensure these blocks are not split when using FSDP/TP
    ]

    def __init__(self, config: SEDDConfig):
        super().__init__(config)

        # ------------------------------------------------------------------
        # The reference SEDD implementation expects a nested Hydra/OmegaConf
        # config, so convert the flat HF config first.
        # ------------------------------------------------------------------
        if OmegaConf is None:
            raise RuntimeError("`omegaconf` is required to instantiate SEDD")

        hydra_cfg = config.to_hydra()
        self.score_model = _OrigSEDD(hydra_cfg)

        # Standard HF post-init (weight init, device/dtype bookkeeping).
        self.post_init()

    # ------------------------------------------------------------------
    # Forward pass
    # ------------------------------------------------------------------

    def forward(
        self,
        input_ids: torch.LongTensor,
        sigma: torch.FloatTensor,
        labels: Optional[torch.LongTensor] = None,
        **kwargs: Any,
    ) -> Union[SEDDOutput, Tuple]:
        """Run a forward pass.

        Parameters
        ----------
        input_ids:
            Token indices of shape ``(batch_size, seq_len)``.
        sigma:
            Noise level ("time-step") of shape ``(batch_size,)``.
        labels:
            *Optional* label tensor used to compute a cross-entropy training
            loss. If provided the returned :class:`SEDDOutput` will contain a
            ``loss`` field.

        Returns
        -------
        :class:`SEDDOutput`, or a plain tuple when ``config.return_dict`` is
        falsy.
        """
        logits = self.score_model(indices=input_ids, sigma=sigma)

        loss: Optional[torch.Tensor] = None
        if labels is not None:
            # Standard CE loss over the last dimension (vocab).
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        if not self.config.return_dict:
            output: Tuple[Any, ...] = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SEDDOutput(loss=loss, logits=logits)

    # ------------------------------------------------------------------
    # Weight loading helpers – delegate to the *original* SEDD layout so that
    # checkpoints trained with the previous implementation can be re-used.
    # ------------------------------------------------------------------

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        *model_args: Any,
        **kwargs: Any,
    ) -> "SEDDModel":
        """Overrides the default method to allow loading legacy SEDD checkpoints
        whose weights are saved via ``torch.save({'model': state_dict, ...})``
        under ``checkpoints-meta/checkpoint.pth``.
        """
        try:
            # First try the regular *transformers* loading routine – this will
            # succeed if the repository follows the standard file-naming
            # conventions (i.e. contains a ``pytorch_model.bin`` / safetensors).
            return super().from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs
            )
        except (EnvironmentError, RuntimeError) as e:
            logger.info(
                "Falling back to legacy SEDD checkpoint format because standard "
                "loading raised: %s", e,
            )

        # ----------------------------------------------------------
        # 1. Load config the usual way so we get a `SEDDConfig` instance.
        # ----------------------------------------------------------
        config = kwargs.pop("config", None) or SEDDConfig.from_pretrained(
            pretrained_model_name_or_path
        )
        model = cls(config, *model_args, **kwargs)

        # ----------------------------------------------------------
        # 2. Attempt to locate the legacy *.pth* checkpoint and load it.
        # ----------------------------------------------------------
        import os

        checkpoint_path = os.path.join(
            pretrained_model_name_or_path, "checkpoints-meta", "checkpoint.pth"
        )
        if not os.path.isfile(checkpoint_path):
            raise FileNotFoundError(
                "Could not find legacy SEDD checkpoint at " f"{checkpoint_path}"
            )

        # NOTE(security): `torch.load` unpickles arbitrary objects – only load
        # checkpoints that come from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu")
        state_dict = ckpt.get("model", ckpt)

        # Legacy checkpoints store the raw SEDD weights (possibly under a DDP
        # "module." prefix), while this wrapper nests the network under
        # ``score_model.``. Remap the keys accordingly – without this every
        # key is reported missing/unexpected by ``strict=False`` and NO
        # weights are actually loaded.
        prefix = cls.base_model_prefix + "."
        remapped: Dict[str, torch.Tensor] = {}
        for key, value in state_dict.items():
            if key.startswith("module."):
                key = key[len("module."):]
            if not key.startswith(prefix):
                key = prefix + key
            remapped[key] = value

        missing, unexpected = model.load_state_dict(remapped, strict=False)
        if missing:
            logger.warning("Missing keys when loading SEDD weights: %s", missing)
        if unexpected:
            logger.warning(
                "Unexpected keys when loading SEDD weights: %s", unexpected
            )
        return model
280
+
281
+ ###############################################################################
282
+ # Public API #
283
+ ###############################################################################
284
+
285
+ __all__ = [
286
+ "SEDDConfig",
287
+ "SEDDModel",
288
+ "SEDDOutput",
289
+ ]