JinghuiLuAstronaut committed 7 days ago

Commit

038d1cf

verified ·

1 Parent(s): 30b5140

Add files using upload-large-folder tool

Browse files

Files changed (20) hide show

LTA_openwebtext_dualt/logs/debug_2k_stream1024_fc_mask1_4gpu/debug_2k_stream1024_fc_mask1_4gpu_now_20260517_125945.log +147 -0
LTA_openwebtext_dualt/logs/infer/lta_owt_lm1bclassic_fullvocab_bert_c1024_len1024_elfLdim_d1280_l32_h16_ff5120_lr3e-4_gbs512_2node8gpu_1m_save10k_t-20260522071024-s2ss5_latest_step0030000_shard01_gpu1_b16.log +18 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/activations.py +369 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cache_utils.py +1623 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/configuration_utils.py +1365 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/file_utils.py +105 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/fusion_mapping.py +270 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_processing_backends.py +689 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_processing_utils.py +688 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_utils.py +1069 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/masking_utils.py +1514 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/modeling_attn_mask_utils.py +503 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/blt/configuration_blt.py +286 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/jetmoe/__init__.py +27 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/jetmoe/modeling_jetmoe.py +830 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/vitmatte/__init__.py +29 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/vitmatte/image_processing_pil_vitmatte.py +159 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/optimization.py +1342 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/tokenization_python.py +1420 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/video_utils.py +893 -0

LTA_openwebtext_dualt/logs/debug_2k_stream1024_fc_mask1_4gpu/debug_2k_stream1024_fc_mask1_4gpu_now_20260517_125945.log ADDED Viewed

	@@ -0,0 +1,147 @@

+NCCL version 2.25.1+cuda12.8
+{
+  "device": "cuda:0",
+  "rank": 0,
+  "world_size": 4,
+  "samples": "tokenized_hf:13425484:pad=0",
+  "vocab_size": 2048,
+  "tokenizer_vocab_size": 2048,
+  "save_dir": "runs/debug_2k_stream1024_fc_mask1_4gpu_now_20260517_125945",
+  "batch_size": 32,
+  "grad_accum": 2,
+  "effective_batch_size": 256,
+  "global_batch_size": 256,
+  "lr_schedule": "cosine",
+  "optimizer": "adamw",
+  "epochs": 0.0,
+  "steps_per_epoch": 52443,
+  "total_steps": 1,
+  "warmup_steps": 26222,
+  "warmup_epochs": 0.5,
+  "min_lr": 6e-05,
+  "weight_decay": 0.1,
+  "output_weight_decay": -1.0,
+  "adamw_param_groups": "nanogpt",
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_eps": 1e-08,
+  "muon_impl": "legacy",
+  "muon_momentum": 0.95,
+  "muon_ns_steps": 5,
+  "muon_update_scale": 1.0,
+  "muon_nesterov": false,
+  "muon_width_scale": false,
+  "muon_grouping": "",
+  "muon_param_count": 0,
+  "muon_adam_param_count": 0,
+  "muon_param_names": [],
+  "muon_adam_param_names": [],
+  "muon_effective_nesterov": false,
+  "muon_effective_width_scale": false,
+  "muon_effective_weight_decay": 0.1,
+  "muon_adam_fallback_nesterov": false,
+  "muon_adam_fallback_weight_decay": 0.1,
+  "ema_decay": 0.0,
+  "ema_start_step": 0,
+  "model_type": "ddit",
+  "ddit_mlp_type": "gelu",
+  "elf_num_time_tokens": 4,
+  "elf_num_model_mode_tokens": 0,
+  "qk_norm": true,
+  "output_bias": false,
+  "output_init_std": -1.0,
+  "norm_type": "rmsnorm",
+  "target_loss": "hard_ce",
+  "linear_soft_target_power": 1.0,
+  "linear_soft_target_min_conf": 0.0,
+  "linear_soft_target_max_conf": 1.0,
+  "t_sampling_mode": "logit_normal",
+  "t_sampling_power": 1.0,
+  "t_sampling_eps": 0.0001,
+  "t_sampling_logit_mean": -1.5,
+  "t_sampling_logit_std": 0.8,
+  "dual_t": true,
+  "corrupt_t_mode": "same",
+  "corrupt_min_t": 0.0,
+  "corrupt_max_t": 1.0,
+  "prefix_block_prob": 0.0,
+  "prefix_block_len": 128,
+  "mask_ratio_floor_schedule": "none",
+  "dirichlet_endpoint_mode": "categorical_dual_t",
+  "dirichlet_semantic_t_mode": "same",
+  "dirichlet_semantic_t_value": 0.0,
+  "dirichlet_semantic_t_curve": "linear",
+  "dirichlet_semantic_t_power": 1.0,
+  "endpoint_sequence_random_prob_alpha": 0.0,
+  "categorical_wrong_from_full_vocab": true,
+  "categorical_wrong_from_batch_valid_tokens": false,
+  "categorical_wrong_basin_token_ids": "",
+  "categorical_wrong_basin_prob": 0.0,
+  "categorical_wrong_unigram_prob": 0.0,
+  "categorical_wrong_uniform_prob": 0.0,
+  "categorical_wrong_corpus_unigram_path": "",
+  "categorical_wrong_corpus_unigram_alpha": 1.0,
+  "categorical_wrong_basin_shared_prob": 0.0,
+  "categorical_wrong_unigram_shared_prob": 0.0,
+  "mask_mixture_original_prob": 0.0,
+  "mask_mixture_lowk_prob": 0.0,
+  "mask_mixture_lowcorrupt_prob": 0.0,
+  "mask_mixture_block_prob": 0.0,
+  "mask_mixture_all_prob": 0.0,
+  "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
+  "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
+  "mask_mixture_block_tokens": "64,128",
+  "simplex_bridge_sampler": "dirichlet",
+  "logistic_normal_sigma_min": 0.18,
+  "logistic_normal_sigma_max": 2.2,
+  "logistic_normal_tau_min": 0.65,
+  "logistic_normal_tau_max": 1.15,
+  "torch_compile": false,
+  "compile_mode": "max-autotune",
+  "state_format": "prob",
+  "meanflow_weight": 0.0,
+  "rollout_train_prob": 0.0,
+  "rollout_train_steps": 1,
+  "rollout_train_infer_steps": 64,
+  "rollout_train_temp": 1.45,
+  "rollout_train_max_gamma": 1.0,
+  "rollout_train_corrupt_only": true,
+  "rollout_train_samplewise": false,
+  "rollout_train_compute_always": false,
+  "bridge_noise_init": "logistic_normal",
+  "noise_sigma": -1.0,
+  "allow_tf32": true,
+  "activation_checkpointing": false,
+  "activation_checkpoint_interval": 1,
+  "activation_checkpoint_scope": "block",
+  "ddp_static_graph": false,
+  "ddp_gradient_as_bucket_view": true,
+  "blocking_data_transfer": false,
+  "dataloader_prefetch_factor": 2,
+  "full_train_stats": false,
+  "tokenized_hf": true,
+  "tokenized_pad_token": "pad",
+  "elf_conditional_hf": false,
+  "record_pad_truncate": false,
+  "record_add_eos": false,
+  "record_add_special_tokens": false,
+  "record_pad_token": "pad",
+  "record_shuffle_buffer": 10000,
+  "wrap": false,
+  "wrap_mode": "stream",
+  "wrap_record_buffer_size": 200,
+  "owt_cached_chunks": false,
+  "owt_chunk_cache_dir": "",
+  "owt_chunk_cache_rebuild": false,
+  "owt_chunk_cache_write_batch": 4096,
+  "owt_exact_repeat_per_chunk": 0,
+  "online_chunk_shuffle": false,
+  "online_chunk_shuffle_buffer": 10000,
+  "openwebtext_split": "all",
+  "detokenizer": "auto",
+  "resolved_detokenizer": null,
+  "num_workers": 2,
+  "latest_every": 1000,
+  "resume_path": ""
+}
+step=1 epoch=1/1 epoch_step=1/52443 micro_steps=2 elapsed=2.6s lr=4.576310e-08 loss=7.6246 loss_recon=7.6246 loss_meanflow=0.0000 mean_model_t=0.2163 mean_corrupt_t=0.2163 mean_loss_t_weight=1.0000 linear_soft_target_mean_conf=0.0000 prior_center_loss_beta=0.0000 rollout_train_applied=0.0000 grad_enabled_before_rollout=1.0000 grad_enabled_after_rollout=1.0000 logits_requires_grad=1.0000 raw_loss_requires_grad=1.0000 acc_all=0.0000 corrupt_frac=1.0000 acc_corrupt=0.0000 loss_corrupt=7.6246 wrong_frac=0.7833 init_acc_corrupt=0.1268 acc_corrupt_t_0p0_0p2=0.0000 corrupt_frac_t_0p0_0p2=0.5469 acc_corrupt_t_0p2_0p4=0.0000 corrupt_frac_t_0p2_0p4=0.3750 acc_corrupt_t_0p4_0p6=0.0000 corrupt_frac_t_0p4_0p6=0.0625 acc_corrupt_t_0p6_0p8=0.0000 corrupt_frac_t_0p6_0p8=0.0312 out_w_norm=0.0000 out_g_norm=0.2113 loss_all=7.6246 init_gold_top10=0.1954 init_gold_top100=0.2858

LTA_openwebtext_dualt/logs/infer/lta_owt_lm1bclassic_fullvocab_bert_c1024_len1024_elfLdim_d1280_l32_h16_ff5120_lr3e-4_gbs512_2node8gpu_1m_save10k_t-20260522071024-s2ss5_latest_step0030000_shard01_gpu1_b16.log ADDED Viewed

	@@ -0,0 +1,18 @@

+[ckpt] runs/lta_owt_lm1bclassic_fullvocab_bert_c1024_len1024_elfLdim_d1280_l32_h16_ff5120_lr3e-4_gbs512_2node8gpu_1m_save10k_t-20260522071024-s2ss5/latest.pt step=30000
+[decode] steps128_c1024_t1p45 generated 16/256
+[decode] steps128_c1024_t1p45 generated 32/256
+[decode] steps128_c1024_t1p45 generated 48/256
+[decode] steps128_c1024_t1p45 generated 64/256
+[decode] steps128_c1024_t1p45 generated 80/256
+[decode] steps128_c1024_t1p45 generated 96/256
+[decode] steps128_c1024_t1p45 generated 112/256
+[decode] steps128_c1024_t1p45 generated 128/256
+[decode] steps128_c1024_t1p45 generated 144/256
+[decode] steps128_c1024_t1p45 generated 160/256
+[decode] steps128_c1024_t1p45 generated 176/256
+[decode] steps128_c1024_t1p45 generated 192/256
+[decode] steps128_c1024_t1p45 generated 208/256
+[decode] steps128_c1024_t1p45 generated 224/256
+[decode] steps128_c1024_t1p45 generated 240/256
+[decode] steps128_c1024_t1p45 generated 256/256
+[summary] {"name": "steps128_c1024_t1p45", "step": 30000, "decode_steps": 128, "concentration_max": 1024.0, "raw_genppl": 15.432598193356394, "stripped_genppl": 15.314571366003578, "sample_entropy": 3.169319289650098, "distinct_1": 0.005100250244140625, "distinct_2": 0.10675937805474096, "top_token_mass": 0.2738838195800781, "raw_kept": 256, "stripped_kept": 256}

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/activations.py ADDED Viewed

	@@ -0,0 +1,369 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import math
+from collections import OrderedDict
+import torch
+from torch import Tensor, nn
+from .integrations.hub_kernels import use_kernel_forward_from_hub
+from .utils import logging
+from .utils.import_utils import is_torchdynamo_compiling
+logger = logging.get_logger(__name__)
+@use_kernel_forward_from_hub("GeluTanh")
+class GELUTanh(nn.Module):
+    """
+    A fast C implementation of the tanh approximation of the GeLU activation function. See
+    https://huggingface.co/papers/1606.08415.
+    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
+    match due to rounding errors.
+    Args:
+        use_gelu_tanh_python (`bool`, *optional*, defaults to `False`):
+            When `True`, `forward` uses the explicit PyTorch formula in `_gelu_tanh_python`; otherwise it calls
+            `nn.functional.gelu` with `approximate="tanh"`.
+    """
+    def __init__(self, use_gelu_tanh_python: bool = False):
+        super().__init__()
+        # The implementation is chosen once here; `forward` only dispatches to `self.act`.
+        if use_gelu_tanh_python:
+            self.act = self._gelu_tanh_python
+        else:
+            self.act = functools.partial(nn.functional.gelu, approximate="tanh")
+    def _gelu_tanh_python(self, input: Tensor) -> Tensor:
+        """Compute 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))) with elementwise torch ops."""
+        return input * 0.5 * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply the implementation selected in `__init__` to `input`."""
+        return self.act(input)
+# Added for compatibility with autoawq which is archived now and imports PytorchGELUTanh from activations.py
+PytorchGELUTanh = GELUTanh
+@use_kernel_forward_from_hub("NewGELU")
+class NewGELUActivation(nn.Module):
+    """
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
+    """
+    def forward(self, input: Tensor) -> Tensor:
+        # Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).
+        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+@use_kernel_forward_from_hub("GeLU")
+class GELUActivation(nn.Module):
+    """
+    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
+    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
+    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
+    Args:
+        use_gelu_python (`bool`, *optional*, defaults to `False`):
+            When `True`, `forward` uses the erf-based formula in `_gelu_python`; otherwise it calls
+            `nn.functional.gelu`.
+    """
+    def __init__(self, use_gelu_python: bool = False):
+        super().__init__()
+        # The implementation is chosen once here; `forward` only dispatches to `self.act`.
+        if use_gelu_python:
+            self.act = self._gelu_python
+        else:
+            self.act = nn.functional.gelu
+    def _gelu_python(self, input: Tensor) -> Tensor:
+        """Compute x * 0.5 * (1 + erf(x / sqrt(2))) with elementwise torch ops."""
+        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply the implementation selected in `__init__` to `input`."""
+        return self.act(input)
+@use_kernel_forward_from_hub("SiLU")
+class SiLUActivation(nn.Module):
+    """
+    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+    later.
+    """
+    def forward(self, input: Tensor) -> Tensor:
+        # Thin wrapper: delegates directly to `nn.functional.silu`.
+        return nn.functional.silu(input)
+@use_kernel_forward_from_hub("FastGELU")
+class FastGELUActivation(nn.Module):
+    """
+    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
+    """
+    def forward(self, input: Tensor) -> Tensor:
+        # 0.7978845608 is sqrt(2 / pi) written as a literal; x * c * (1 + 0.044715 * x^2) is the factored
+        # form of c * (x + 0.044715 * x^3), so this is the tanh GELU approximation without a `pow` call.
+        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
+@use_kernel_forward_from_hub("QuickGELU")
+class QuickGELUActivation(nn.Module):
+    """
+    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
+    """
+    def forward(self, input: Tensor) -> Tensor:
+        # Sigmoid-based approximation: x * sigmoid(1.702 * x).
+        return input * torch.sigmoid(1.702 * input)
+class ClippedGELUActivation(nn.Module):
+    """
+    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
+    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
+    https://huggingface.co/papers/2004.09602.
+    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
+    initially created.
+    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
+    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
+    """
+    def __init__(self, min: float, max: float):
+        if min > max:
+            raise ValueError(f"min should be < max (got min: {min}, max: {max})")
+        super().__init__()
+        self.min = min
+        self.max = max
+    def forward(self, x: Tensor) -> Tensor:
+        return torch.clip(gelu(x), self.min, self.max)
+class AccurateGELUActivation(nn.Module):
+    """
+    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
+    https://github.com/hendrycks/GELUs
+    Implemented along with MEGA (Moving Average Equipped Gated Attention)
+    """
+    def __init__(self):
+        super().__init__()
+        # sqrt(2 / pi), computed once so `forward` does not recompute it on every call.
+        self.precomputed_constant = math.sqrt(2 / math.pi)
+    def forward(self, input: Tensor) -> Tensor:
+        # Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).
+        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))
+class MishActivation(nn.Module):
+    """
+    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
+    visit the official repository for the paper: https://github.com/digantamisra98/Mish
+    """
+    def __init__(self):
+        super().__init__()
+        # `forward` always uses the built-in functional implementation.
+        self.act = nn.functional.mish
+    def _mish_python(self, input: Tensor) -> Tensor:
+        """Compute x * tanh(softplus(x)) with torch ops. Not referenced by `forward` in this class."""
+        return input * torch.tanh(nn.functional.softplus(input))
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply `nn.functional.mish` (stored in `self.act`) to `input`."""
+        return self.act(input)
+class LinearActivation(nn.Module):
+    """
+    Applies the linear activation function, i.e. forwarding input directly to output.
+    """
+    def forward(self, input: Tensor) -> Tensor:
+        # Identity: the same tensor object is returned, no copy is made.
+        return input
+class LaplaceActivation(nn.Module):
+    """
+    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
+    https://huggingface.co/papers/2209.10655
+    Inspired by squared relu, but with bounded range and gradient for better stability
+    """
+    def forward(self, input, mu=0.707107, sigma=0.282095):
+        # Computes 0.5 * (1 + erf((x - mu) / (sigma * sqrt(2)))), i.e. the CDF of a normal distribution
+        # with mean `mu` and standard deviation `sigma`, evaluated elementwise.
+        # NOTE(review): the defaults are numerically close to sqrt(1/2) and sqrt(1/(4*pi)); confirm against the paper.
+        input = (input - mu).div(sigma * math.sqrt(2.0))
+        return 0.5 * (1.0 + torch.erf(input))
+class ReLUSquaredActivation(nn.Module):
+    """
+    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668
+    """
+    def forward(self, input):
+        # relu(x) ** 2: negative inputs map to 0, positive inputs are squared.
+        relu_applied = nn.functional.relu(input)
+        squared = torch.square(relu_applied)
+        return squared
+class SqrtSoftplusActivation(nn.Module):
+    """sqrt(softplus(x)) — the router scoring function used by DeepSeek V4."""
+    def forward(self, input):
+        # Softplus is applied first, then the elementwise square root of its result.
+        return nn.functional.softplus(input).sqrt()
+class ClassInstantier(OrderedDict):
+    """
+    Ordered mapping whose item lookup returns a freshly constructed object instead of the stored value.
+    A stored value is either a class, or a `(class, kwargs)` tuple; `self[key]` calls `class(**kwargs)`
+    on every access, so two lookups of the same key return two distinct instances.
+    """
+    def __getitem__(self, key):
+        content = super().__getitem__(key)
+        # A bare class is treated as `(class, {})`, i.e. constructed with no keyword arguments.
+        cls, kwargs = content if isinstance(content, tuple) else (content, {})
+        return cls(**kwargs)
+class XIELUActivation(nn.Module):
+    """
+    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010
+    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
+    Otherwise, we emit a single warning and use xIELU Python
+    Args:
+        alpha_p_init: Initial value of the positive-branch coefficient (after softplus).
+        alpha_n_init: Initial value of the negative-branch coefficient (after softplus and adding `beta`).
+        beta: Linear-term coefficient shared by both branches; registered as a buffer.
+        eps: Upper bound applied to `x` inside `expm1` on the negative branch; registered as a buffer.
+        dtype: dtype used for the parameters and buffers created here.
+        with_vector_loads: Flag forwarded unchanged to the CUDA kernel.
+    """
+    def __init__(
+        self,
+        alpha_p_init=0.8,
+        alpha_n_init=0.8,
+        beta=0.5,
+        eps=-1e-6,
+        dtype=torch.bfloat16,
+        with_vector_loads=False,
+    ):
+        super().__init__()
+        # Parameters are stored as log(expm1(v)), the inverse of softplus, so that applying softplus
+        # in `_xielu_python` recovers `alpha_p_init` and `alpha_n_init - beta` at initialization.
+        self.alpha_p = nn.Parameter(torch.log(torch.expm1(torch.tensor(alpha_p_init, dtype=dtype))).unsqueeze(0))
+        self.alpha_n = nn.Parameter(
+            torch.log(torch.expm1(torch.tensor(alpha_n_init - beta, dtype=dtype))).unsqueeze(0)
+        )
+        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
+        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
+        self.with_vector_loads = with_vector_loads
+        # Temporary until xIELU CUDA fully implemented
+        # Plain Python floats are kept so the CUDA call does not need `.item()` on the buffers.
+        self._beta_scalar = float(beta)
+        self._eps_scalar = float(eps)
+        # Stays `None` when the optional CUDA extension cannot be loaded; `forward` checks this.
+        self._xielu_cuda_obj = None
+        try:
+            import xielu.ops  # noqa: F401
+            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
+            msg = "Using experimental xIELU CUDA."
+            try:
+                from torch.compiler import allow_in_graph
+                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
+                msg += " Enabled torch._dynamo for xIELU CUDA."
+            except Exception as err:
+                msg += f" Could not enable torch._dynamo for xIELU ({err}) - this may result in slower performance."
+                self._xielu_cuda_fn = self._xielu_cuda
+            logger.warning_once(msg)
+        except Exception as err:
+            # Deliberate best-effort: any failure to load the extension falls back to the Python path.
+            # `_xielu_cuda_fn` is not set in this case; it is only read when `_xielu_cuda_obj` is not None.
+            logger.warning_once(
+                f"CUDA-fused xIELU not available ({err}) – falling back to a Python version.\n"
+                "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`"
+            )
+    def _xielu_python(self, x: Tensor) -> Tensor:
+        """Pure-PyTorch xIELU: quadratic-plus-linear for x > 0, expm1-based branch otherwise."""
+        alpha_p = nn.functional.softplus(self.alpha_p)
+        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
+        return torch.where(
+            x > 0,
+            alpha_p * x * x + self.beta * x,
+            # `torch.min(x, self.eps)` caps the expm1 argument at `eps` on the non-positive branch.
+            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
+        )
+    def _xielu_cuda(self, x: Tensor) -> Tensor:
+        """Firewall function to prevent torch.compile from seeing .item() calls"""
+        original_shape = x.shape
+        # CUDA kernel expects 3D tensors, reshape if needed
+        while x.dim() < 3:
+            x = x.unsqueeze(0)
+        if x.dim() > 3:
+            # Flatten all leading dimensions into the first axis, keeping the last dimension intact.
+            x = x.view(-1, 1, x.size(-1))
+        if original_shape != x.shape:
+            logger.warning_once(
+                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
+                original_shape,
+                x.shape,
+            )
+        result = self._xielu_cuda_obj.forward(
+            x,
+            self.alpha_p.to(x.dtype),
+            self.alpha_n.to(x.dtype),
+            # Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
+            self._beta_scalar,
+            self._eps_scalar,
+            self.with_vector_loads,
+        )
+        # Restore the caller's shape after the 3D reshape above.
+        return result.view(original_shape)
+    def forward(self, input: Tensor) -> Tensor:
+        """Use the CUDA kernel when it was loaded, `input` is on CUDA and dynamo is not compiling; else Python."""
+        if self._xielu_cuda_obj is not None and input.is_cuda:
+            if not is_torchdynamo_compiling():
+                return self._xielu_cuda_fn(input)
+            else:
+                logger.warning_once("torch._dynamo is compiling, using Python version of xIELU.")
+        return self._xielu_python(input)
+# Maps an activation name to how it is built: either a module class (constructed with no arguments)
+# or a `(class, kwargs)` tuple (constructed as `class(**kwargs)`).
+ACT2CLS = {
+    "gelu": GELUActivation,
+    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
+    "gelu_fast": FastGELUActivation,
+    "gelu_new": NewGELUActivation,
+    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
+    "gelu_pytorch_tanh": GELUTanh,
+    "gelu_python_tanh": (GELUTanh, {"use_gelu_tanh_python": True}),
+    "gelu_accurate": AccurateGELUActivation,
+    "hardswish": nn.Hardswish,
+    "laplace": LaplaceActivation,
+    "leaky_relu": nn.LeakyReLU,
+    "linear": LinearActivation,
+    "mish": MishActivation,
+    "quick_gelu": QuickGELUActivation,
+    "relu": nn.ReLU,
+    "relu2": ReLUSquaredActivation,
+    "relu6": nn.ReLU6,
+    "sigmoid": nn.Sigmoid,
+    "silu": SiLUActivation,
+    "sqrtsoftplus": SqrtSoftplusActivation,
+    "swish": nn.SiLU,
+    "tanh": nn.Tanh,
+    "prelu": nn.PReLU,
+    "xielu": XIELUActivation,
+}
+# `ACT2FN[name]` constructs and returns a new module instance on every lookup (see `ClassInstantier`).
+ACT2FN = ClassInstantier(ACT2CLS)
+def get_activation(activation_string):
+    """
+    Return a new activation module for the given name.
+    Args:
+        activation_string: Key to look up in `ACT2FN`.
+    Returns:
+        The object produced by `ACT2FN[activation_string]`, i.e. a newly constructed module.
+    Raises:
+        KeyError: If `activation_string` is not a key of `ACT2FN`; the message lists the valid keys.
+    """
+    if activation_string in ACT2FN:
+        return ACT2FN[activation_string]
+    else:
+        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
+# For backwards compatibility with: from activations import gelu_python
+# Each name below is bound to a module instance created once at import time, so it can be called
+# like a function (e.g. `gelu(x)`, which `ClippedGELUActivation.forward` relies on).
+gelu_python = get_activation("gelu_python")
+gelu_new = get_activation("gelu_new")
+gelu = get_activation("gelu")
+gelu_fast = get_activation("gelu_fast")
+gelu_pytorch_tanh = get_activation("gelu_pytorch_tanh")
+quick_gelu = get_activation("quick_gelu")
+silu = get_activation("silu")
+mish = get_activation("mish")
+linear_act = get_activation("linear")

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cache_utils.py ADDED Viewed

	@@ -0,0 +1,1623 @@

+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+import torch
+from .configuration_utils import PreTrainedConfig
+from .utils import (
+    is_hqq_available,
+    is_optimum_quanto_available,
+    is_quanto_greater,
+    is_torch_greater_or_equal,
+    is_torchdynamo_compiling,
+    logging,
+)
+if is_hqq_available():
+    from hqq.core.quantize import Quantizer as HQQQuantizer
+_is_torch_greater_or_equal_than_2_7 = is_torch_greater_or_equal("2.7", accept_dev=True)
+logger = logging.get_logger(__name__)
+# Registry mapping ``config.layer_types[i]`` -> the dynamic cache layer class to build for
+# that layer. ``DynamicCache.__init__`` consults this mapping when a ``config`` is provided
+# so models with custom layer types (e.g. DeepSeek-V4's CSA / HCA) can register their own
+# cache-layer subclass and stop needing a model-specific ``Cache`` subclass.
+#
+# A cache layer subclass with a class attribute ``layer_type = "..."`` auto-registers via
+# ``CacheLayerMixin.__init_subclass__``. Each registered class must accept a
+# ``PreTrainedConfig`` (the decoder text config) as the only positional argument.
+LAYER_TYPE_CACHE_MAPPING: dict[str, type] = {}
+class CacheLayerMixin(ABC):
+    """Base, abstract class for a single layer's cache."""
+    is_compileable = False
+    # Subclasses can set ``layer_type`` to auto-register themselves in
+    # ``LAYER_TYPE_CACHE_MAPPING`` at import time (used by ``DynamicCache`` to dispatch
+    # per-layer cache classes from ``config.layer_types``).
+    layer_type: str | None = None
+    def __init_subclass__(cls, **kwargs):
+        """Register `cls` in `LAYER_TYPE_CACHE_MAPPING` when it declares its own `layer_type`."""
+        super().__init_subclass__(**kwargs)
+        # `cls.__dict__` is read (not `getattr`) so an inherited `layer_type` does not re-register a
+        # subclass under its parent's key; a later class with the same key overwrites the earlier one.
+        layer_type = cls.__dict__.get("layer_type", None)
+        if layer_type is not None:
+            LAYER_TYPE_CACHE_MAPPING[layer_type] = cls
+    def __init__(self):
+        # Storage starts empty; subclasses fill it in `lazy_initialization` and set `is_initialized`.
+        self.keys: torch.Tensor | None = None
+        self.values: torch.Tensor | None = None
+        self.is_initialized = False
+    def __repr__(self):
+        # Only the class name is shown; tensor contents are not included.
+        return f"{self.__class__.__name__}"
+    # Allocate or prepare `keys`/`values` based on the first states seen.
+    @abstractmethod
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None: ...
+    # Store the new states and return the key/value tensors to use.
+    @abstractmethod
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]: ...
+    # Return a pair of ints used for mask generation for a query of `query_length` tokens.
+    @abstractmethod
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]: ...
+    # Return the number of tokens currently cached.
+    @abstractmethod
+    def get_seq_length(self) -> int: ...
+    # Return the maximum cache length supported by the layer.
+    @abstractmethod
+    def get_max_cache_shape(self) -> int: ...
+    def offload(self):
+        """Offload this layer's data to CPU device."""
+        if self.is_initialized:
+            self.keys = self.keys.to("cpu", non_blocking=True)
+            self.values = self.values.to("cpu", non_blocking=True)
+    def prefetch(self):
+        """In case of layer offloading, this allows to move the data back to the layer's device ahead of time."""
+        # `self.device` is not set by `__init__` here; it is expected to be set by the subclass's
+        # `lazy_initialization`, which is why the `is_initialized` check comes first.
+        if self.is_initialized and self.keys.device != self.device:
+            self.keys = self.keys.to(self.device, non_blocking=True)
+            self.values = self.values.to(self.device, non_blocking=True)
+    def reset(self) -> None:
+        """Resets the cache values while preserving the objects"""
+        if self.is_initialized:
+            # In-place zeroing keeps the existing tensor objects.
+            self.keys.zero_()
+            self.values.zero_()
+        # This attribute is set on several Layers
+        if hasattr(self, "cumulative_length"):
+            # It can either be an int for dynamic layers, or a tensor for static layers
+            if isinstance(self.cumulative_length, int):
+                self.cumulative_length = 0
+            else:
+                self.cumulative_length.zero_()
+    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
+        """Reorders this layer's cache for beam search."""
+        if self.get_seq_length() > 0:
+            # Rows are selected along dim 0; `beam_idx` is moved to each tensor's device first.
+            self.keys = self.keys.index_select(0, beam_idx.to(self.keys.device))
+            self.values = self.values.index_select(0, beam_idx.to(self.values.device))
+class DynamicLayer(CacheLayerMixin):
+    """
+    A cache layer that grows dynamically as more tokens are generated. This is the default for generative models.
+    It stores the key and value states as tensors of shape `[batch_size, num_heads, seq_len, head_dim]`.
+    """
+    is_sliding = False
+    def __init__(self, config: PreTrainedConfig | None = None):
+        # `config` is accepted but not read here.
+        super().__init__()
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        """Record dtype/device from `key_states` and create empty `keys`/`values` placeholders."""
+        self.dtype, self.device = key_states.dtype, key_states.device
+        # Empty tensors let the first `update` go through the same `torch.cat` path as later calls.
+        self.keys = torch.tensor([], dtype=self.dtype, device=self.device)
+        self.values = torch.tensor([], dtype=self.dtype, device=self.device)
+        self.is_initialized = True
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Update the key and value caches in-place, and return the necessary keys and value states.
+        Args:
+            key_states (`torch.Tensor`): The new key states to cache.
+            value_states (`torch.Tensor`): The new value states to cache.
+        Returns:
+            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
+        """
+        # Lazy initialization
+        if not self.is_initialized:
+            self.lazy_initialization(key_states, value_states)
+        # New states are appended along dim -2; `torch.cat` rebinds `keys`/`values` to new tensors.
+        self.keys = torch.cat([self.keys, key_states], dim=-2)
+        self.values = torch.cat([self.values, value_states], dim=-2)
+        return self.keys, self.values
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
+        """Return the length and offset of the cache, used to generate the mask"""
+        # The offset is always 0; the length is the cached tokens plus the incoming query tokens.
+        kv_offset = 0
+        kv_length = self.get_seq_length() + query_length
+        return kv_length, kv_offset
+    def get_seq_length(self) -> int:
+        """Returns the sequence length of the cached states."""
+        # The empty placeholder from `lazy_initialization` counts as length 0.
+        if not self.is_initialized or self.keys.numel() == 0:
+            return 0
+        return self.keys.shape[-2]
+    def get_max_cache_shape(self) -> int:
+        """Returns the maximum sequence length of the cache object. DynamicLayer does not have a maximum length."""
+        return -1
+    def crop(self, max_length: int) -> None:
+        """
+        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be negative
+        to remove `max_length` tokens.
+        """
+        if max_length < 0:
+            # NOTE(review): if abs(max_length) exceeds the cached length, the result stays negative and
+            # the slice below drops only that many trailing tokens instead of emptying the cache - confirm intent.
+            max_length = self.get_seq_length() - abs(max_length)
+        # Nothing to do when the cache already fits within `max_length`.
+        if self.get_seq_length() <= max_length:
+            return
+        self.keys = self.keys[..., :max_length, :]
+        self.values = self.values[..., :max_length, :]
+    def batch_repeat_interleave(self, repeats: int) -> None:
+        """Repeat the cache `repeats` times in the batch dimension."""
+        if self.get_seq_length() > 0:
+            self.keys = self.keys.repeat_interleave(repeats, dim=0)
+            self.values = self.values.repeat_interleave(repeats, dim=0)
+    def batch_select_indices(self, indices: torch.Tensor) -> None:
+        """Only keep the `indices` in the batch dimension of the cache."""
+        if self.get_seq_length() > 0:
+            self.keys = self.keys[indices, ...]
+            self.values = self.values[indices, ...]
+class DynamicSlidingWindowLayer(DynamicLayer):
+    """
+    A cache layer that grows dynamically as more tokens are generated, up until the sliding window size.
+    It stores the key and value states as tensors of shape `[batch_size, num_heads, min(seq_len, sliding_window), head_dim]`.
+    """
+    is_sliding = True
+    def __init__(self, config: PreTrainedConfig | None = None, sliding_window: int | None = None):
+        super().__init__()
+        # Accept either a config (registry-style construction via LAYER_TYPE_CACHE_MAPPING)
+        # or a raw ``sliding_window`` int (legacy callers).
+        if sliding_window is None:
+            if config is None:
+                raise ValueError("Either `config` or `sliding_window` must be provided.")
+            sliding_window = getattr(config, "sliding_window", None) or getattr(config, "attention_chunk_size", None)
+        self.sliding_window = sliding_window
+        self.cumulative_length = 0
+        self._sliding_window_tensor = torch.tensor(self.sliding_window, dtype=torch.long)
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        super().lazy_initialization(key_states, value_states)
+        self._sliding_window_tensor = self._sliding_window_tensor.to(self.device)
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Update the key and value caches in-place, and return the necessary keys and value states.
+        Args:
+            key_states (`torch.Tensor`): The new key states to cache.
+            value_states (`torch.Tensor`): The new value states to cache.
+        Returns:
+            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
+        """
+        # Lazy initialization
+        if not self.is_initialized:
+            self.lazy_initialization(key_states, value_states)
+        self.cumulative_length += key_states.shape[-2]
+        # Compute the full states
+        full_key_states = torch.cat([self.keys, key_states], dim=-2)
+        full_value_states = torch.cat([self.values, value_states], dim=-2)
+        # Only cache the last `self.sliding_window - 1` tokens (or all of them if lower than that)
+        self.keys = full_key_states[:, :, -self.sliding_window + 1 :, :]
+        self.values = full_value_states[:, :, -self.sliding_window + 1 :, :]
+        # Return the full states
+        return full_key_states, full_value_states
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
+        """Return the length and offset of the cache, used to generate the attention mask"""
+        is_full = self.cumulative_length >= self.sliding_window
+        kv_offset = max(self.cumulative_length - self.sliding_window + 1, 0)
+        if is_full:
+            kv_length = self.sliding_window - 1 + query_length
+        else:
+            kv_length = self.cumulative_length + query_length
+        return kv_length, kv_offset
+    def get_seq_length(self) -> int:
+        """Returns the sequence length of the cached states."""
+        return self.cumulative_length
+    def get_max_cache_shape(self) -> int:
+        """Return the maximum cache shape of the cache"""
+        return self.sliding_window
+    def crop(self, max_length: int) -> None:
+        """
+        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
+        negative to remove `max_length` tokens.
+        """
+        if self.get_seq_length() >= self.sliding_window:
+            raise ValueError(
+                "Cannot `crop` a `DynamicSlidingWindowLayer` after it has seen more tokens than its"
+                "sliding window (otherwise some states are lost)"
+            )
+        super().crop(max_length)
+        self.cumulative_length = self.keys.shape[-2]
+class StaticLayer(CacheLayerMixin):
+    """
+    A static cache layer that stores the key and value states as static tensors of shape `[batch_size, num_heads, max_cache_len, head_dim]`.
+    It lazily allocates its full backing tensors, and then mutates them in-place. Built for `torch.compile` support.
+    Args:
+        max_cache_len (`int`):
+            Maximum number of tokens that can be stored, used for tensor preallocation.
+    """
+    is_compileable = True
+    is_sliding = False
+    def __init__(self, max_cache_len: int):
+        super().__init__()
+        self.max_cache_len = max_cache_len
+        # Very important that it's a tensor here, to avoid recompiling when we update it and use it to create positions
+        self.cumulative_length = torch.tensor([0], dtype=int)
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        """
+        Lazy initialization of the keys and values tensors. This allows to get all properties (dtype, device,
+        num_heads in case of TP etc...) at runtime directly, which is extremely practical as it avoids moving
+        devices, dtypes etc later on for each `update` (which could break the static dynamo addresses as well).
+        If this is unwanted, one can call `early_initialization(...)` on the Cache directly, which will call this
+        function ahead-of-time (this is required for `torch.export` for example). Note that for `compile`, as we
+        internally don't compile the prefill, this is guaranteed to have been called already when compiling.
+        If compiling the prefill as well, e.g. calling `model.compile(...)` before `generate` with a static cache,
+        it is still supported in general, but without guarantees depending on the compilation options (e.g. cuda graphs,
+        i.e. `mode="reduce-overhead"` is known to fail). But it will in general work correctly, and prefill should
+        not be compiled anyway for performances!
+        Args:
+            key_states (`torch.Tensor`): First key states seen; provides dtype, device, batch size, head count and key head dim.
+            value_states (`torch.Tensor`): First value states seen; provides the value head dim.
+        """
+        self.dtype, self.device = key_states.dtype, key_states.device
+        self.max_batch_size, self.num_heads = key_states.shape[:2]
+        # Keys and values are allowed to have different head dims, so both are recorded
+        self.v_head_dim = value_states.shape[-1]
+        self.k_head_dim = key_states.shape[-1]
+        self.keys = torch.zeros(
+            (self.max_batch_size, self.num_heads, self.max_cache_len, self.k_head_dim),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        self.values = torch.zeros(
+            (self.max_batch_size, self.num_heads, self.max_cache_len, self.v_head_dim),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        # The length counter is created on the default device in `__init__`; move it next to the cache tensors
+        self.cumulative_length = self.cumulative_length.to(self.device)
+        # Note: `mark_static_address` is used to tag the tensors as a fixed data pointer, preventing compiled graph
+        # breaks or cudagraph skips due to inplace mutations when updating the cache. However, it is not supported when
+        # tracing the graph, so we skip it in this case. As prefill should never be compiled, this is not an issue and it
+        # will still be run (except when users compile prefill explicitly, but this should be avoided!)
+        # Without this, we cannot use cudagraphs!!
+        if not is_torchdynamo_compiling():
+            torch._dynamo.mark_static_address(self.keys)
+            torch._dynamo.mark_static_address(self.values)
+            torch._dynamo.mark_static_address(self.cumulative_length)
+        self.is_initialized = True
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Update the key and value caches in-place, and return the necessary keys and value states.
+        Args:
+            key_states (`torch.Tensor`): The new key states to cache.
+            value_states (`torch.Tensor`): The new value states to cache.
+        Returns:
+            tuple[`torch.Tensor`, `torch.Tensor`]: The full preallocated key and value tensors (of length
+            `max_cache_len` along the sequence dimension), after writing the new states into them.
+        """
+        # Lazy initialization
+        if not self.is_initialized:
+            self.lazy_initialization(key_states, value_states)
+        # Create a tensor to slice the static kv at the correct indices
+        kv_length = key_states.shape[-2]
+        # Positions are computed from the length BEFORE it is incremented below
+        cache_position = torch.arange(kv_length, device=self.device) + self.cumulative_length
+        # Note that has to be performed in-place, as we have a static address that we need to keep
+        self.cumulative_length.add_(kv_length)
+        # Update the cache
+        try:
+            self.keys.index_copy_(2, cache_position, key_states)
+            self.values.index_copy_(2, cache_position, value_states)
+        except NotImplementedError:
+            # Fallback for devices like MPS where index_copy_ might not be supported.
+            self.keys[:, :, cache_position] = key_states
+            self.values[:, :, cache_position] = value_states
+        return self.keys, self.values
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
+        """
+        Return the length and offset of the cache, used to generate the attention mask.
+        `query_length` is not used here: the returned length is always `max_cache_len` and the offset is always 0.
+        """
+        kv_offset = 0
+        kv_length = self.max_cache_len
+        return kv_length, kv_offset
+    def get_seq_length(self) -> int:
+        """
+        Returns the sequence length of the cached states.
+        Note that once the layer is initialized this returns the `cumulative_length` tensor itself (created with
+        shape `[1]` in `__init__`), not a Python int; before initialization it returns the int `0`.
+        """
+        return self.cumulative_length if self.is_initialized else 0
+    def get_max_cache_shape(self) -> int:
+        """Return the maximum cache shape of the cache"""
+        return self.max_cache_len
+class StaticSlidingWindowLayer(StaticLayer):
+    """
+    A static cache layer that stores the key and value states as static tensors of shape
+    `[batch_size, num_heads, min(max_cache_len, sliding_window), head_dim]`. It lazily allocates its full backing
+    tensors, and then mutates them in-place. Built for `torch.compile` support.
+    Args:
+        max_cache_len (`int`):
+            Maximum number of tokens that can be stored, used for tensor preallocation.
+        sliding_window (`int`):
+            The size of the sliding window.
+    """
+    is_sliding = True
+    def __init__(self, max_cache_len: int, sliding_window: int):
+        # From here on, `self.max_cache_len` holds the smaller of the two values, i.e. the effective window size
+        effective_max_cache_len = min(sliding_window, max_cache_len)
+        super().__init__(max_cache_len=effective_max_cache_len)
+        # Here, to avoid data-dependent control flows, we also need to use a python int to keep track of the cumulative length
+        self.cumulative_length_int = 0
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Update the key and value caches in-place, and return the necessary keys and value states.
+        Four cases are handled, based on the length seen before this call and the number of new tokens:
+        already full with a single new token (roll + overwrite last slot), already full with several new tokens,
+        becoming full during this call, and still fitting in the preallocated tensors (plain indexed write).
+        Args:
+            key_states (`torch.Tensor`): The new key states to cache.
+            value_states (`torch.Tensor`): The new value states to cache.
+        Returns:
+            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states. These are the static `self.keys` /
+            `self.values` tensors in the single-token-full and still-fitting cases, and freshly concatenated
+            (longer) tensors otherwise.
+        """
+        # Lazy initialization
+        if not self.is_initialized:
+            self.lazy_initialization(key_states, value_states)
+        kv_length = key_states.shape[-2]
+        current_length = self.cumulative_length_int
+        is_full = current_length >= self.max_cache_len
+        # Update it now that we saved the value above
+        self.cumulative_length_int += kv_length
+        if is_full:
+            # In general, we should use a much simpler `cat` here as well, independently of the states size. However,
+            # dynamo is currently bugged when doing it - see https://github.com/pytorch/pytorch/issues/159855 for more details
+            if key_states.shape[-2] == 1:
+                # Roll all values to the left by 1 position
+                new_keys = self.keys.roll(-1, dims=-2)
+                new_values = self.values.roll(-1, dims=-2)
+                # Overwrite the last position with new states
+                # (note: very important to use a tensor to index here, see https://github.com/pytorch/pytorch/issues/159855)
+                index = torch.tensor([-1], dtype=int, device=self.device)
+                new_keys[:, :, index] = key_states
+                new_values[:, :, index] = value_states
+                # Copy back into `self` (do not just assign again) in order to keep the static dynamo address
+                self.keys.copy_(new_keys)
+                self.values.copy_(new_values)
+                # Very important to return the `self` tensors here, as they have the static dynamo address
+                return self.keys, self.values
+            # Already full but using more than 1 new token (e.g. prefill caching, chat continuation, etc...)
+            else:
+                # Drop the oldest cached token and append all the new ones
+                full_key_states = torch.cat((self.keys[:, :, 1:, :], key_states), dim=-2)
+                full_value_states = torch.cat((self.values[:, :, 1:, :], value_states), dim=-2)
+        # Not yet full, but becoming full on this update
+        elif current_length + kv_length > self.max_cache_len:
+            # Fast prefill path, no need to cat() in this case, as the cache is currently empty
+            if current_length == 0:
+                full_key_states = key_states
+                full_value_states = value_states
+            else:
+                # Only the first `current_length` slots hold real tokens; the rest is still unwritten padding
+                full_key_states = torch.cat((self.keys[:, :, :current_length, :], key_states), dim=-2)
+                full_value_states = torch.cat((self.values[:, :, :current_length, :], value_states), dim=-2)
+        else:
+            # Note: very important to use the tensor version of the cumulative length here, as otherwise cudagraphs
+            # (triggered by mode="reduce-overhead") will lead to random crashes, as the int would be overwritten
+            cache_position = torch.arange(kv_length, device=self.device) + self.cumulative_length
+            try:
+                self.keys.index_copy_(2, cache_position, key_states)
+                self.values.index_copy_(2, cache_position, value_states)
+            except NotImplementedError:
+                # Same fallback as in `StaticLayer.update`, for devices where `index_copy_` is not implemented
+                self.keys[:, :, cache_position] = key_states
+                self.values[:, :, cache_position] = value_states
+            # Update the tensor version of the length in-place (we don't need to update it if we are already outside
+            # of this branch, as we don't need the tensor anymore)
+            self.cumulative_length.add_(kv_length)
+            # Very important to return the `self` tensors here, as they have the static dynamo address
+            return self.keys, self.values
+        # We only cache the last `sliding_window` tokens
+        self.keys.copy_(full_key_states[:, :, -self.max_cache_len :, :])
+        self.values.copy_(full_value_states[:, :, -self.max_cache_len :, :])
+        # we should return the whole states instead of `self.keys/values` here, as otherwise we lose some context
+        return full_key_states, full_value_states
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
+        """
+        Return the length and offset of the cache, used to generate the attention mask.
+        Args:
+            query_length (`int`): Number of new tokens about to be processed.
+        Returns:
+            tuple[`int`, `int`]: `(kv_length, kv_offset)`, computed from the Python-int length counter.
+        """
+        sliding_window = self.max_cache_len
+        is_full = self.cumulative_length_int >= self.max_cache_len
+        kv_offset = max(self.cumulative_length_int - sliding_window + 1, 0)
+        # The cache is already full
+        if is_full:
+            kv_length = sliding_window + query_length - 1
+        # Not yet full, but becoming full on this update
+        elif self.cumulative_length_int + query_length > sliding_window:
+            kv_length = self.cumulative_length_int + query_length
+        # Here the Cache is still smaller than the local size, but we return the local size as it's static
+        else:
+            kv_length = sliding_window
+        return kv_length, kv_offset
+    def get_seq_length(self) -> int:
+        """Returns the sequence length of the cached states, as tracked by the Python-int counter."""
+        return self.cumulative_length_int
+    def reset(self):
+        """Reset the layer through the parent implementation, and also zero the Python-int length counter."""
+        super().reset()
+        self.cumulative_length_int = 0
+class QuantizedLayer(DynamicLayer):
+    """
+    A quantized layer similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://huggingface.co/papers/2402.02750).
+    It allows the model to generate longer sequence length without allocating too much memory for the key and value caches by
+    applying quantization.
+    The cache has two types of storage, one for original precision and one for the quantized cache. A `residual length`
+    is set as a maximum capacity for the original precision cache. When the length goes beyond maximum capacity, the original
+    precision cache is discarded and moved into the quantized cache. The quantization is done per-channel with a set `q_group_size`
+    for both Keys and Values, in contrast to what was described in the paper.
+    """
+    def __init__(
+        self,
+        nbits: int = 4,
+        axis_key: int = 0,
+        axis_value: int = 0,
+        q_group_size: int = 64,
+        residual_length: int = 128,
+    ):
+        super().__init__()
+        self.nbits = nbits
+        self.axis_key = axis_key
+        self.axis_value = axis_value
+        self.q_group_size = q_group_size
+        self.residual_length = residual_length
+        self.cumulative_length = 0
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Update the key and value caches in-place, and return the necessary keys and value states.
+        Args:
+            key_states (`torch.Tensor`): The new key states to cache.
+            value_states (`torch.Tensor`): The new value states to cache.
+        Returns:
+            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
+        """
+        self.cumulative_length += key_states.shape[-2]
+        # Lazy initialization
+        if not self.is_initialized:
+            self.lazy_initialization(key_states, value_states)
+            self._quantized_keys = self._quantize(key_states.contiguous(), axis=self.axis_key)
+            self._quantized_values = self._quantize(value_states.contiguous(), axis=self.axis_value)
+            return key_states, value_states
+        dequant_keys = self._dequantize(self._quantized_keys)
+        dequant_values = self._dequantize(self._quantized_values)
+        keys_to_return = torch.cat([dequant_keys, self.keys, key_states], dim=-2)
+        values_to_return = torch.cat([dequant_values, self.values, value_states], dim=-2)
+        if self.keys.dim() == 4 and self.keys.shape[-2] + 1 >= self.residual_length:
+            self._quantized_keys = self._quantize(keys_to_return.contiguous(), axis=self.axis_key)
+            self._quantized_values = self._quantize(values_to_return.contiguous(), axis=self.axis_value)
+            self.keys = torch.tensor([], dtype=key_states.dtype, device=key_states.device)
+            self.values = torch.tensor([], dtype=key_states.dtype, device=key_states.device)
+        else:
+            self.keys = torch.cat([self.keys, key_states], dim=-2)
+            self.values = torch.cat([self.values, value_states], dim=-2)
+        return keys_to_return, values_to_return
+    @abstractmethod
+    def _quantize(self, tensor, axis): ...
+    @abstractmethod
+    def _dequantize(self, q_tensor): ...
+    def get_seq_length(self) -> int:
+        """Returns the sequence length of the cached states."""
+        return self.cumulative_length
+class QuantoQuantizedLayer(QuantizedLayer):
+    def __init__(
+        self,
+        nbits: int = 4,
+        axis_key: int = 0,
+        axis_value: int = 0,
+        q_group_size: int = 64,
+        residual_length: int = 128,
+    ):
+        super().__init__(
+            nbits=nbits,
+            axis_key=axis_key,
+            axis_value=axis_value,
+            q_group_size=q_group_size,
+            residual_length=residual_length,
+        )
+        # We need to import quanto here to avoid circular imports due to optimum/quanto/models/transformers_models.py
+        if not is_optimum_quanto_available():
+            raise ImportError(
+                "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto "
+                "backend. Please install it via  with `pip install optimum-quanto`"
+            )
+        elif is_quanto_greater("0.2.5", accept_dev=True):
+            from optimum.quanto import MaxOptimizer, qint2, qint4
+        else:
+            raise ImportError(
+                "You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedLayer`. "
+            )
+        if self.nbits not in [2, 4]:
+            raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}")
+        if self.axis_key not in [0, -1]:
+            raise ValueError(f"`axis_key` for `quanto` backend has to be one of [`0`, `-1`] but got {self.axis_key}")
+        if self.axis_value not in [0, -1]:
+            raise ValueError(
+                f"`axis_value` for `quanto` backend has to be one of [`0`, `-1`] but got {self.axis_value}"
+            )
+        self.qtype = qint4 if self.nbits == 4 else qint2
+        self.optimizer = MaxOptimizer()  # hardcode as it's the only one for per-channel quantization
+    def _quantize(self, tensor, axis):
+        from optimum.quanto import quantize_weight
+        scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
+        qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
+        return qtensor
+    def _dequantize(self, qtensor):
+        return qtensor.dequantize()
+class HQQQuantizedLayer(QuantizedLayer):
+    def __init__(
+        self,
+        nbits: int = 4,
+        axis_key: int = 0,
+        axis_value: int = 0,
+        q_group_size: int = 64,
+        residual_length: int = 128,
+    ):
+        super().__init__(
+            nbits=nbits,
+            axis_key=axis_key,
+            axis_value=axis_value,
+            q_group_size=q_group_size,
+            residual_length=residual_length,
+        )
+        if not is_hqq_available():
+            raise ImportError(
+                "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
+                "Please install it via  with `pip install hqq`"
+            )
+        if self.nbits not in [1, 2, 3, 4, 8]:
+            raise ValueError(
+                f"`nbits` for `HQQ` backend has to be one of [`1`, `2`, `3`, `4`, `8`] but got {self.nbits}"
+            )
+        if self.axis_key not in [0, 1]:
+            raise ValueError(f"`axis_key` for `HQQ` backend has to be one of [`0`, `1`] but got {self.axis_key}")
+        if self.axis_value not in [0, 1]:
+            raise ValueError(f"`axis_value` for `HQQ` backend has to be one of [`0`, `1`] but got {self.axis_value}")
+        self.quantizer = HQQQuantizer
+    def _quantize(self, tensor, axis):
+        qtensor, meta = self.quantizer.quantize(
+            tensor,
+            axis=axis,
+            device=self.keys.device,
+            compute_dtype=self.keys.dtype,
+            nbits=self.nbits,
+            group_size=self.q_group_size,
+        )
+        meta["compute_dtype"] = self.keys.dtype
+        self.quantizer.cuda(qtensor, meta=meta, device=self.keys.device)  # Move to device and cast to dtype
+        meta["scale"] = meta["scale"].to(qtensor.device)
+        meta["zero"] = meta["zero"].to(qtensor.device)
+        return qtensor, meta
+    def _dequantize(self, qtensor):
+        quant_tensor, meta = qtensor
+        tensor = self.quantizer.dequantize(quant_tensor, meta)
+        return tensor
+class LinearAttentionCacheLayerMixin(ABC):
+    """Base, abstract class for a linear attention single layer's cache."""
+    # All shapes are static by essence in a LinearAttention layer, so it is compileable
+    is_compileable = True
+    def __init__(self):
+        self.conv_states: torch.Tensor | None = None
+        self.recurrent_states: torch.Tensor | None = None
+        self.is_conv_states_initialized = False
+        self.is_recurrent_states_initialized = False
+        self.has_previous_state = False
+    def __repr__(self):
+        return f"{self.__class__.__name__}"
+    @abstractmethod
+    def lazy_initialization(
+        self, conv_states: torch.Tensor | None = None, recurrent_states: torch.Tensor | None = None
+    ) -> None: ...
+    @abstractmethod
+    def update_conv_state(self, conv_states: torch.Tensor) -> torch.Tensor: ...
+    @abstractmethod
+    def update_recurrent_state(self, recurrent_states: torch.Tensor) -> torch.Tensor: ...
+    def offload(self):
+        """Offload this layer's data to CPU device."""
+        if self.is_conv_states_initialized:
+            self.conv_states = self.conv_states.to("cpu", non_blocking=True)
+        if self.is_recurrent_states_initialized:
+            self.recurrent_states = self.recurrent_states.to("cpu", non_blocking=True)
+    def prefetch(self):
+        """In case of layer offloading, this allows to move the data back to the layer's device ahead of time."""
+        if self.is_conv_states_initialized and self.conv_states.device != self.device:
+            self.conv_states = self.conv_states.to(self.device, non_blocking=True)
+        if self.is_recurrent_states_initialized and self.recurrent_states.device != self.device:
+            self.recurrent_states = self.recurrent_states.to(self.device, non_blocking=True)
+    def reset(self) -> None:
+        """Resets the cache values while preserving the objects"""
+        if self.is_conv_states_initialized:
+            self.conv_states.zero_()
+        if self.is_recurrent_states_initialized:
+            self.recurrent_states.zero_()
+        self.has_previous_state = False
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        if self.is_conv_states_initialized:
+            self.conv_states = self.conv_states.index_select(0, beam_idx.to(self.device))
+        # recurrent_states can stay empty sometimes, see e.g. lfm2 which only uses the conv_states
+        if self.is_recurrent_states_initialized:
+            self.recurrent_states = self.recurrent_states.index_select(0, beam_idx.to(self.device))
+    def crop(self, max_length: int):
+        # We don't crop the linear attention cache, so simply do nothing here
+        pass
+class LinearAttentionLayer(LinearAttentionCacheLayerMixin):
+    def __init__(self, config: PreTrainedConfig | None = None):
+        super().__init__()
+    def lazy_initialization(
+        self, conv_states: torch.Tensor | None = None, recurrent_states: torch.Tensor | None = None
+    ) -> None:
+        # Here, we will lazy init both states separately, each in their own update function
+        if conv_states is not None:
+            self.dtype, self.device = conv_states.dtype, conv_states.device
+            # Even if prefill is larfer/shorter than the conv_size, the tensor is always either padded or truncated
+            self.max_batch_size, self.conv_kernel_size = conv_states.shape[0], conv_states.shape[-1]
+            # The shape is always static, so we init as such
+            self.conv_states = torch.zeros_like(conv_states, dtype=self.dtype, device=self.device)
+            # Mark as static address to be able to use cudagraphs
+            if not is_torchdynamo_compiling():
+                torch._dynamo.mark_static_address(self.conv_states)
+            self.is_conv_states_initialized = True
+        if recurrent_states is not None:
+            # The shape is always static, so we init as such
+            self.recurrent_states = torch.zeros_like(recurrent_states, dtype=self.dtype, device=self.device)
+            # Mark as static address to be able to use cudagraphs
+            if not is_torchdynamo_compiling():
+                torch._dynamo.mark_static_address(self.recurrent_states)
+            self.is_recurrent_states_initialized = True
+    def update_conv_state(self, conv_states: torch.Tensor, **kwargs) -> torch.Tensor:
+        """
+        Update the linear attention cache in-place, and return the necessary conv states.
+        Args:
+            conv_states (`torch.Tensor`): The new conv states to cache.
+        Returns:
+            `torch.Tensor`: The updated conv states.
+        """
+        # Lazy initialization
+        if not self.is_conv_states_initialized:
+            self.lazy_initialization(conv_states=conv_states)
+        if not self.has_previous_state:
+            # Note that we copy instead of assigning, to preserve the static address for cudagraphs
+            self.conv_states.copy_(conv_states)
+            self.has_previous_state = True
+        # Technically, this update is not logically correct if the prefill is smaller than `conv_kernel_size`,
+        # as it will `roll` anyway in the first decoding step, even though it should `roll` ONLY if the cache is already full.
+        # But since `conv_kernel_size=4` in practice, it's almost impossible to have a smaller prefill so it's mostly fine for now
+        else:
+            # Note that we copy instead of assigning, to preserve the static address for cudagraphs
+            num_new_tokens = conv_states.shape[-1]
+            if num_new_tokens >= self.conv_kernel_size:
+                self.conv_states.copy_(conv_states[..., -self.conv_kernel_size :])
+            else:
+                new_conv_states = self.conv_states.roll(shifts=-num_new_tokens, dims=-1)
+                new_conv_states[:, :, -num_new_tokens:] = conv_states
+                self.conv_states.copy_(new_conv_states)
+        return self.conv_states
+    def update_recurrent_state(self, recurrent_states: torch.Tensor, **kwargs) -> torch.Tensor:
+        """
+        Update the linear attention cache in-place, and return the necessary ssm states.
+        Args:
+            smm_states (`torch.Tensor`): The new ssm states to cache.
+        Returns:
+            `torch.Tensor`: The updated ssm states.
+        """
+        if not self.is_recurrent_states_initialized:
+            self.lazy_initialization(recurrent_states=recurrent_states)
+        # Note that we copy instead of assigning, to preserve the static address for cudagraphs
+        self.recurrent_states.copy_(recurrent_states)
+        return self.recurrent_states
+class LinearAttentionAndFullAttentionLayer(LinearAttentionLayer, DynamicLayer):
+    # The dynamic Attention part makes it non-compileable
+    is_compileable = False
+    def __init__(self, config: PreTrainedConfig | None = None):
+        DynamicLayer.__init__(self)
+        LinearAttentionLayer.__init__(self)
+    def lazy_initialization(self, *args, **kwargs) -> None:
+        # When the Attention cache is used with `update`, `lazy_initialization` is called with 2 positional args
+        if len(args) == 2 and len(kwargs) == 0:
+            DynamicLayer.lazy_initialization(self, *args)
+        # Otherwise, for the LinearAttention cache, when it's called in `update_conv_state` or `update_recurrent_state`, it's
+        # always called with 1 single kwarg (cause it needs to know if it's for the conv or ssm states)
+        if len(args) == 0 and len(kwargs) == 1:
+            LinearAttentionLayer.lazy_initialization(self, **kwargs)
+    def reset(self) -> None:
+        LinearAttentionLayer.reset(self)
+        DynamicLayer.reset(self)
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        LinearAttentionLayer.reorder_cache(self, beam_idx)
+        DynamicLayer.reorder_cache(self, beam_idx)
+# Pre-register the standard layer types: each key is a layer-type string and each value is the cache layer
+# class to use for it. Some classes are shared between multiple types
+# (e.g. ``DynamicSlidingWindowLayer`` covers both ``"sliding_attention"`` and
+# ``"chunked_attention"``) — those need an explicit map entry rather than the
+# auto-registration via ``CacheLayerMixin.__init_subclass__``.
+LAYER_TYPE_CACHE_MAPPING.update(
+    {
+        "full_attention": DynamicLayer,
+        # From a cache point of view, sliding and chunked are the same in how they should behave;
+        # only the mask differs.
+        "sliding_attention": DynamicSlidingWindowLayer,
+        "chunked_attention": DynamicSlidingWindowLayer,
+        # Linear-attention-shaped layers (mamba / conv / pure linear-attention / moe placeholders)
+        # don't grow per-token KV; they're tracked just so position bookkeeping stays consistent.
+        "mamba": LinearAttentionLayer,
+        "conv": LinearAttentionLayer,
+        "linear_attention": LinearAttentionLayer,
+        "moe": LinearAttentionLayer,
+        # Hybrid layers (e.g. zamba / zamba2) carry both a linear-attention state and a dynamic-attention state.
+        "hybrid": LinearAttentionAndFullAttentionLayer,
+    }
+)
+class Cache:
+    """
+    A `Cache` is mostly a list of `CacheLayerMixin` objects, one per model layer. It serves as a container for
+    the Cache of each layer.
+    Args:
+        layers (`Optional`, *optional*):
+            A list of pre-created `CacheLayerMixin` or `LinearAttentionCacheLayerMixin`. If omitted (`None`), then `layer_class_to_replicate`
+            will be used.
+        layer_class_to_replicate (`type[CacheLayerMixin | LinearAttentionCacheLayerMixin]`, *optional*):
+            Only used if `layers` is omitted (`None`), in which case it will be used as the base class for each layer,
+            and the layers will be added lazily as soon as `update` is called with a `layer_idx` greater than the current
+            list of layers.
+        offloading (`bool`, *optional*, defaults to `False`):
+            Whether to perform offloading of the layers to `cpu`, to save GPU memory.
+        offload_only_non_sliding (`bool`, *optional*, defaults to `True`):
+            If `offloading` is `True`, this further decides if only the non-sliding layers will be offloaded (because
+            usually the sliding layers are small in size, so there is no need to offload them, and skipping it is faster).
+    """
+    def __init__(
+        self,
+        layers: list[CacheLayerMixin | LinearAttentionCacheLayerMixin] | None = None,
+        layer_class_to_replicate: type[CacheLayerMixin | LinearAttentionCacheLayerMixin] | None = None,
+        offloading: bool = False,
+        offload_only_non_sliding: bool = True,
+    ):
+        if layers is not None and layer_class_to_replicate is not None:
+            raise ValueError(
+                "You can construct a Cache either from a list `layers` of all the predefined `CacheLayer`, or from a "
+                "`layer_class_to_replicate`, in which case the Cache will append a new layer corresponding to "
+                "`layer_class_to_replicate` for each new call to `update` with an idx not already in the Cache."
+            )
+        if layers is None and layer_class_to_replicate is None:
+            raise ValueError(
+                "You should provide exactly one of `layers` or `layer_class_to_replicate` to initialize a Cache."
+            )
+        self.layers = layers if layers is not None else []
+        self.layer_class_to_replicate = layer_class_to_replicate
+        self.offloading = offloading
+        if self.offloading:
+            self.only_non_sliding = offload_only_non_sliding
+            self.prefetch_stream = torch.Stream() if _is_torch_greater_or_equal_than_2_7 else torch.cuda.Stream()
+    def __repr__(self):
+        return f"{self.__class__.__name__}(layers={self.layers})"
+    def prefetch(self, layer_idx: int, only_non_sliding: bool = True):
+        """
+        Prefetch a given layer on its device. If `only_non_sliding` is True, it will try to prefetch only the layers
+        which are non-sliding. If the `layer_idx` is outside the range, this will circle back to the first layers.
+        Note that we use a non-default stream for this, to avoid blocking.
+        """
+        if only_non_sliding:
+            # Try to find next non-sliding, starting at `layer_idx`
+            try:
+                layer_idx = layer_idx + self.is_sliding[layer_idx:].index(False)
+            # In this case, we need to circle back to the beginning
+            except ValueError:
+                layer_idx = self.is_sliding.index(False)
+        else:
+            layer_idx = layer_idx if layer_idx < len(self.layers) else 0
+        # Prefetch
+        with self.prefetch_stream if _is_torch_greater_or_equal_than_2_7 else torch.cuda.stream(self.prefetch_stream):
+            self.layers[layer_idx].prefetch()
+    def offload(self, layer_idx: int, only_non_sliding: bool = True):
+        """
+        Offload a given `layer_idx`. If `only_non_sliding` is True, it will offload `layer_idx` only if it is a
+        non-sliding layer. Note that we do it on the default stream, so that we ensure all earlier
+        computation in the layer's `update` methods are finished.
+        """
+        if not (only_non_sliding and self.is_sliding[layer_idx]):
+            self.layers[layer_idx].offload()
+    def update(
+        self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int, *args, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # In this case, the `layers` were not provided, and we must append as much as `layer_idx`
+        if self.layer_class_to_replicate is not None:
+            while len(self.layers) <= layer_idx:
+                self.layers.append(self.layer_class_to_replicate())
+        if self.offloading:
+            # Wait for the stream to finish if needed, and start prefetching the next layer
+            torch.cuda.default_stream(key_states.device).wait_stream(self.prefetch_stream)
+            self.prefetch(layer_idx + 1, self.only_non_sliding)
+        keys, values = self.layers[layer_idx].update(key_states, value_states, *args, **kwargs)
+        if self.offloading:
+            self.offload(layer_idx, self.only_non_sliding)
+        return keys, values
+    def update_conv_state(self, conv_states: torch.Tensor, layer_idx: int, **kwargs) -> torch.Tensor:
+        """
+        Updates the cache with the new `conv_states` for the layer `layer_idx`.
+        Parameters:
+            conv_states (`torch.Tensor`):
+                The new conv states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+        Return:
+            `torch.Tensor`: The updated conv states.
+        """
+        # NOTE: if we slightly break `update` arg order, we could combine this with it, and allow offloading support
+        # out of the box
+        if not isinstance(self.layers[layer_idx], LinearAttentionCacheLayerMixin):
+            raise ValueError("Cannot call `update_conv_state` on a non-LinearAttention layer!")
+        conv_states = self.layers[layer_idx].update_conv_state(conv_states, **kwargs)
+        return conv_states
+    def update_recurrent_state(self, recurrent_states: torch.Tensor, layer_idx: int, **kwargs) -> torch.Tensor:
+        """
+        Updates the cache with the new `recurrent_states` for the layer `layer_idx`.
+        Parameters:
+            smm_states (`torch.Tensor`):
+                The new ssm states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+        Return:
+            `torch.Tensor`: The updated ssm states.
+        """
+        # NOTE: if we slightly break `update` arg order, we could combine this with it, and allow offloading support
+        # out of the box
+        if not isinstance(self.layers[layer_idx], LinearAttentionCacheLayerMixin):
+            raise ValueError("Cannot call `update_conv_state` on a non-LinearAttention layer!")
+        recurrent_states = self.layers[layer_idx].update_recurrent_state(recurrent_states, **kwargs)
+        return recurrent_states
+    def early_initialization(
+        self,
+        batch_size: int,
+        num_heads: int | list[int],
+        head_dim: int | list[int],
+        dtype: torch.dtype,
+        device: torch.device,
+    ):
+        """
+        Initialize all the layers in advance (it's otherwise lazily initialized on the first `update` call).
+        This is useful for our `export` recipes, as `export` needs everything in advance.
+        """
+        # To allow different num_heads and head_dim depending on layers, we accept lists
+        if isinstance(num_heads, int):
+            num_heads = [num_heads] * len(self)
+        if isinstance(head_dim, int):
+            head_dim = [head_dim] * len(self)
+        if len(num_heads) != len(self.layers):
+            raise ValueError(
+                f"`num_head` was provided as a list of length {len(num_heads)}, but the Cache currently has {len(self.layers)} layers"
+            )
+        if len(head_dim) != len(self.layers):
+            raise ValueError(
+                f"`head_dim` was provided as a list of length {len(num_heads)}, but the Cache currently has {len(self.layers)} layers"
+            )
+        for layer, layer_num_heads, layer_head_dim in zip(self.layers, num_heads, head_dim):
+            # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use
+            # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only
+            # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical
+            fake_kv_tensor = torch.zeros((batch_size, layer_num_heads, 0, layer_head_dim), dtype=dtype, device=device)
+            # Init the layer
+            layer.lazy_initialization(fake_kv_tensor, fake_kv_tensor)
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        """Returns the sequence length of the cache for the given layer."""
+        if layer_idx >= len(self.layers):
+            return 0
+        # For alternating attention/linear attention  caches, `get_seq_length` needs to use attention layer idx when called with default layer_idx
+        if not isinstance(self.layers[layer_idx], CacheLayerMixin):
+            # If this is called with non-default arg, raise
+            if layer_idx != 0:
+                raise ValueError(
+                    f"You called `get_seq_length` on layer index {layer_idx}, but this layer is a LinearAttention layer, which "
+                    "does not track sequence length."
+                )
+            try:
+                # Use the first attention layer
+                layer_idx = next(idx for idx in range(len(self)) if isinstance(self.layers[idx], CacheLayerMixin))
+            except StopIteration:
+                raise ValueError(
+                    "`get_seq_length` can only be called on Attention layers, and the current Cache seem to only contain "
+                    "LinearAttention layers."
+                )
+        return self.layers[layer_idx].get_seq_length()
+    def has_previous_state(self, layer_idx: int | None = None) -> bool:
+        """Returns whether the LinearAttention layer at index `layer_idx` has previous state or not."""
+        if layer_idx is not None and layer_idx >= len(self.layers):
+            return False
+        # In this case, use last LinearAttention layer
+        if layer_idx is None:
+            try:
+                layer_idx = next(
+                    idx
+                    for idx in range(len(self) - 1, -1, -1)
+                    if isinstance(self.layers[idx], LinearAttentionCacheLayerMixin)
+                )
+            except StopIteration:
+                raise ValueError(
+                    "`has_previous_state` can only be called on LinearAttention layers, and the current Cache seem to "
+                    "only contain Attention layers."
+                )
+        elif not isinstance(self.layers[layer_idx], LinearAttentionCacheLayerMixin):
+            raise ValueError(
+                f"You called `has_previous_state` on layer index {layer_idx}, but this layer is an Attention layer, which "
+                "does not support calling it."
+            )
+        return self.layers[layer_idx].has_previous_state
+    def get_mask_sizes(self, query_length: int, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns for each layer.
+        """
+        # For DynamicCache, where the layers are created at runtime -> if it was not yet created, the size is
+        # simply the query_length
+        if layer_idx >= len(self.layers):
+            return query_length, 0
+        # For alternating attention/linear attention caches, `get_mask_sizes` needs to use attention layer idx when called with default layer_idx
+        if not isinstance(self.layers[layer_idx], CacheLayerMixin):
+            # If this is called with non-default arg, raise
+            if layer_idx != 0:
+                raise ValueError(
+                    f"You called `get_mask_sizes` on layer index {layer_idx}, but this layer is a LinearAttention layer, which "
+                    "does not track sequence length."
+                )
+            try:
+                # Use the first attention layer
+                layer_idx = next(idx for idx in range(len(self)) if isinstance(self.layers[idx], CacheLayerMixin))
+            except StopIteration:
+                raise ValueError(
+                    "`get_mask_sizes` can only be called on Attention layers, and the current Cache seem to only contain "
+                    "LinearAttention layers."
+                )
+        return self.layers[layer_idx].get_mask_sizes(query_length)
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        """Returns maximum sequence length of the cache object. Dynamic caches do not have a maximum length."""
+        # For DynamicCache, where the layers are created at runtime -> if it was not yet created, return -1
+        # as DynamicLayer does
+        if layer_idx >= len(self.layers):
+            return -1
+        return self.layers[layer_idx].get_max_cache_shape()
+    def reset(self):
+        """Recursively reset all layers tensors"""
+        for layer_idx in range(len(self.layers)):
+            self.layers[layer_idx].reset()
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorder the cache for beam search"""
+        for layer_idx in range(len(self.layers)):
+            self.layers[layer_idx].reorder_cache(beam_idx)
+    def crop(self, max_length: int):
+        """Crop the cache to the given length"""
+        for layer_idx in range(len(self.layers)):
+            self.layers[layer_idx].crop(max_length)
+    def batch_repeat_interleave(self, repeats: int):
+        """Repeat and interleave the cache"""
+        for layer_idx in range(len(self.layers)):
+            self.layers[layer_idx].batch_repeat_interleave(repeats)
+    def batch_select_indices(self, indices: torch.Tensor):
+        """Select indices from the cache"""
+        for layer_idx in range(len(self.layers)):
+            self.layers[layer_idx].batch_select_indices(indices)
+    @property
+    def max_batch_size(self) -> int:
+        """Return the maximum batch size of the cache"""
+        values = [layer.max_batch_size for layer in self.layers]
+        if len(set(values)) > 1:
+            raise ValueError(f"Max batch size is not consistent across layers: {values}")
+        return values[0]
+    @property
+    def max_cache_len(self) -> int:
+        """Return the maximum cache length of the cache"""
+        values = [layer.max_cache_len for layer in self.layers]
+        return max(values)
+    @property
+    def is_compileable(self) -> bool:
+        """Return whether the cache is compilable"""
+        # For DynamicCache dispatching the layers lazily (otherwise, all([]) is True)
+        if len(self.layers) == 0:
+            return False
+        return all(layer.is_compileable for layer in self.layers)
+    @property
+    def is_initialized(self) -> bool:
+        """Return whether the cache data is initialized"""
+        return len(self.layers) > 0 and all(layer.is_initialized for layer in self.layers)
+    @property
+    def is_sliding(self) -> list[bool]:
+        """Return whether the layers of the cache are sliding window"""
+        return [getattr(layer, "is_sliding", False) for layer in self.layers]
+    def __len__(self):
+        """
+        Return the number of layers currently held by the cache, which corresponds to the number of layers in
+        the model once they have all been created.
+        """
+        # Note: for DynamicCache, layers are initialized lazily, so this will not be accurate before the first
+        # forward through all the layers
+        return len(self.layers)
+class DynamicCache(Cache):
+    """
+    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
+    It stores the key and value states as a list of `CacheLayer`, one for each layer. The expected shape for each tensor
+    in the `CacheLayer`s is `[batch_size, num_heads, seq_len, head_dim]`.
+    If a config is passed, it will additionally check for sliding or hybrid cache structure, greatly reducing the
+    memory requirement of the cached tensors to `[batch_size, num_heads, min(seq_len, sliding_window), head_dim]`.
+    See `Cache` for details on common methods that are implemented by all cache classes.
+    Args:
+        ddp_cache_data (`Iterable[tuple[torch.Tensor, torch.Tensor]]`, *optional*):
+            It was originally added for compatibility with `torch.distributed` (DDP). In a nutshell, it is
+            `map(gather_map, zip(*caches))`, i.e. each item in the iterable contains the key and value states
+            for a layer gathered across replicas by torch.distributed (shape=[global batch size, num_heads, seq_len, head_dim]).
+            Note: it needs to be the 1st arg as well to work correctly
+        config (`PreTrainedConfig`, *optional*):
+            The config of the model for which this Cache will be used. If passed, it will be used to check for sliding
+            or hybrid layer structure, greatly reducing the memory requirement of the cached tensors to
+            `[batch_size, num_heads, min(seq_len, sliding_window), head_dim]`.
+        offloading (`bool`, *optional*, defaults to `False`):
+            Whether to perform offloading of the layers to `cpu`, to save GPU memory.
+        offload_only_non_sliding (`bool`, *optional*, defaults to `False`):
+            If `offloading` is `True`, this further decides if only the non-sliding layers will be offloaded (because
+            usually the sliding layers are small in size, so there is no need to offload them, and skipping it is faster).
+    Example:
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+    >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+    >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")
+    >>> # Prepare a cache class and pass it to model's forward
+    >>> past_key_values = DynamicCache(config=model.config)
+    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
+    >>> outputs.past_key_values # access cache filled with key/values from generation
+    ```
+    """
+    def __init__(
+        self,
+        ddp_cache_data: Iterable[tuple[torch.Tensor | None, ...]] | None = None,
+        config: PreTrainedConfig | None = None,
+        offloading: bool = False,
+        offload_only_non_sliding: bool = False,
+    ):
+        layers = []
+        # If a config is passed, use it to infer the layer types and initialize accordingly
+        if config is not None:
+            decoder_config = config.get_text_config(decoder=True)
+            sliding_window = getattr(decoder_config, "sliding_window", None) or getattr(
+                decoder_config, "attention_chunk_size", None
+            )
+            layer_types = getattr(decoder_config, "layer_types", None)
+            if layer_types is None:
+                layer_types = []
+                for _ in range(decoder_config.num_hidden_layers):
+                    if sliding_window is not None:
+                        layer_types.append("sliding_attention")
+                    else:
+                        layer_types.append("full_attention")
+            # Some models have shared layers thus no cache is needed for them (e.g. Gemma3n)
+            if hasattr(decoder_config, "num_kv_shared_layers"):
+                layer_types = layer_types[: -decoder_config.num_kv_shared_layers]
+            for layer_type in layer_types:
+                cache_cls = LAYER_TYPE_CACHE_MAPPING.get(layer_type, DynamicLayer)
+                layers.append(cache_cls(decoder_config))
+        # In this case, use the passed data to already fill in the Cache
+        if ddp_cache_data is not None:
+            # Init all the layers with the data
+            for layer_idx, kv_and_optional_sliding in enumerate(ddp_cache_data):
+                # If the config was not passed above, initialize a new cache layer for each entry of the ddp_data
+                if config is None:
+                    # kv_and_optional_sliding contains at least two elements: the key and value states. It can also
+                    # contain a third element, which is an optional sliding window tensor.
+                    sliding_window_tensor = kv_and_optional_sliding[2] if len(kv_and_optional_sliding) == 3 else None
+                    # If there is a sliding window tensor, use it to initialize the layer
+                    if sliding_window_tensor is not None:
+                        # Since the same layer is dispatched across replicas, sliding_window is the same for all
+                        sliding_window = sliding_window_tensor[0].item()
+                        layers.append(DynamicSlidingWindowLayer(sliding_window=sliding_window))
+                    else:
+                        layers.append(DynamicLayer())
+                # Update the layer with the data
+                _, _ = layers[layer_idx].update(kv_and_optional_sliding[0], kv_and_optional_sliding[1])
+        # If neither of config nor ddp_data was passed, then simply lazy init a full cache of DynamicLayer
+        if len(layers) == 0:
+            super().__init__(
+                layer_class_to_replicate=DynamicLayer,
+                offloading=offloading,
+                offload_only_non_sliding=offload_only_non_sliding,
+            )
+        else:
+            super().__init__(layers=layers, offloading=offloading, offload_only_non_sliding=offload_only_non_sliding)
+    def __iter__(self):
+        for layer in self.layers:
+            yield layer.keys, layer.values, getattr(layer, "_sliding_window_tensor", None)
+class StaticCache(Cache):
+    """
+    Static Cache class to be used with `torch.compile(model)` and `torch.export()`. It will check the `config`
+    for potential hybrid cache structure, and initialize each layer accordingly.
+    See `Cache` for details on common methods that are implemented by all cache classes.
+    Args:
+        config (`PreTrainedConfig`):
+            The config of the model for which this Cache will be used. It will be used to check for sliding
+            or hybrid layer structure, and initialize each layer accordingly.
+        max_cache_len (`int`):
+            The maximum number of tokens that this Cache should hold.
+        offloading (`bool`, *optional*, defaults to `False`):
+            Whether to perform offloading of the layers to `cpu`, to save GPU memory.
+        offload_only_non_sliding (`bool`, *optional*, defaults to `True`):
+            If `offloading` is `True`, this further decides if only the non-sliding layers will be offloaded (because
+            usually the sliding layers are small in size, so there is no need to offload them, and skipping it is faster).
+    Example:
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache
+    >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+    >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+    >>> inputs = tokenizer(text="My name is Llama", return_tensors="pt")
+    >>> # Prepare a cache class and pass it to model's forward
+    >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
+    >>> max_generated_length = inputs.input_ids.shape[1] + 10
+    >>> past_key_values = StaticCache(config=model.config, max_cache_len=max_generated_length)
+    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
+    >>> outputs.past_key_values # access cache filled with key/values from generation
+    StaticCache()
+    ```
+    """
+    # Pass-in kwargs as well to avoid crashing for BC (it used more arguments before)
+    def __init__(
+        self,
+        config: PreTrainedConfig,
+        max_cache_len: int,
+        offloading: bool = False,
+        offload_only_non_sliding: bool = True,
+        **kwargs,
+    ):
+        config = config.get_text_config(decoder=True)
+        layer_types = getattr(config, "layer_types", None)
+        # If `layer_types` is not explicitly provided, infer if the model is fully sliding
+        if layer_types is None:
+            if getattr(config, "sliding_window", None) is not None:
+                layer_types = ["sliding_attention" for _ in range(config.num_hidden_layers)]
+            elif getattr(config, "attention_chunk_size", None) is not None:
+                layer_types = ["chunked_attention" for _ in range(config.num_hidden_layers)]
+            else:
+                layer_types = ["full_attention" for _ in range(config.num_hidden_layers)]
+        # Some models have shared layers thus no cache is needed for them (e.g. Gemma3n)
+        if hasattr(config, "num_kv_shared_layers"):
+            layer_types = layer_types[: -config.num_kv_shared_layers]
+        sliding_layer_types = {
+            name
+            for name, cls in LAYER_TYPE_CACHE_MAPPING.items()
+            if isinstance(cls, type) and issubclass(cls, DynamicSlidingWindowLayer) and name != "chunked_attention"
+        }
+        layers = []
+        for layer_type in layer_types:
+            if layer_type == "chunked_attention":
+                # From a cache point of view, both sliding and chunked are the same in how they should behave and how many
+                # states they should return - only the mask changes to make them different at the end!
+                layer = StaticSlidingWindowLayer(
+                    max_cache_len=max_cache_len, sliding_window=config.attention_chunk_size
+                )
+            elif layer_type in sliding_layer_types:
+                layer = StaticSlidingWindowLayer(max_cache_len=max_cache_len, sliding_window=config.sliding_window)
+            # LinearAttention layers are static by essence - using `"moe"` as well is a trick, see the comment about it on DynamicCache
+            elif layer_type in ("mamba", "conv", "linear_attention", "moe"):
+                layer = LinearAttentionLayer()
+            else:
+                layer = StaticLayer(max_cache_len=max_cache_len)
+            layers.append(layer)
+        super().__init__(layers=layers, offloading=offloading, offload_only_non_sliding=offload_only_non_sliding)
+class QuantizedCache(Cache):
+    """
+    A quantizer cache similar to what is described in the
+    [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://huggingface.co/papers/2402.02750).
+    It allows the model to generate longer sequence length without allocating too much memory for keys and values
+    by applying quantization.
+    The cache has two types of storage, one for original precision and one for the
+    quantized cache. A `residual length` is set as a maximum capacity for the original precision cache. When the
+    length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache.
+    The quantization is done per-channel with a set `q_group_size` for both keys and values, in contrast to what was
+    described in the paper.
+    See `Cache` for details on common methods that are implemented by all cache classes.
+    Args:
+        backend (`str`):
+            The quantization backend to use. One of `("quanto", "hqq").
+        config (`PreTrainedConfig`):
+            The config of the model for which this Cache will be used.
+        nbits (`int`, *optional*, defaults to 4):
+            The number of bits for quantization.
+        axis_key (`int`, *optional*, defaults to 0):
+            The axis on which to quantize the keys.
+        axis_value (`int`, *optional*, defaults to 0):
+            The axis on which to quantize the values.
+        q_group_size (`int`, *optional*, defaults to 64):
+            Quantization is done per-channel according to a set `q_group_size` for both keys and values.
+        residual_length (`int`, *optional*, defaults to 128):
+            Maximum capacity for the original precision cache
+    """
+    def __init__(
+        self,
+        backend: str,
+        config: PreTrainedConfig,
+        nbits: int = 4,
+        axis_key: int = 0,
+        axis_value: int = 0,
+        q_group_size: int = 64,
+        residual_length: int = 128,
+    ):
+        if backend == "quanto":
+            layer_class = QuantoQuantizedLayer
+        elif backend == "hqq":
+            layer_class = HQQQuantizedLayer
+        else:
+            raise ValueError(f"Unknown quantization backend `{backend}`")
+        config = config.get_text_config(decoder=True)
+        layers = [
+            layer_class(nbits, axis_key, axis_value, q_group_size, residual_length)
+            for _ in range(config.num_hidden_layers)
+        ]
+        super().__init__(layers=layers)
+class EncoderDecoderCache(Cache):
+    """
+    Base, abstract class for all encoder-decoder caches. Can be used to hold combinations of self-attention and
+    cross-attention caches.
+    See `Cache` for details on common methods that are implemented by all cache classes.
+    Args:
+        caches (`Iterable`):
+            Usually an iterable of length 2, containing 2 `Cache` objects, the first one for self-attention, the
+            second one for cross-attention. Can optionally also be an iterable of length 1, containing a
+            `tuple[tuple[torch.Tensor]]` (usually used for compatibility with torch dp and ddp).
+    Raises:
+        ValueError: If the number of positional arguments is not 1 or 2, or if (single-argument form) a per-layer
+            entry does not have 4 or 6 elements.
+        TypeError: If two arguments are passed and one of them is not a `Cache`.
+    Example:
+    ```python
+    >>> from transformers import AutoProcessor, AutoModelForCausalLM, DynamicCache, EncoderDecoderCache
+    >>> model = AutoModelForCausalLM.from_pretrained("openai/whisper-small")
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-small")
+    >>> inputs = processor(audio=YOUR-AUDIO, return_tensors="pt")
+    >>> # Prepare cache classes for encoder and decoder and pass it to model's forward
+    >>> self_attention_cache = DynamicCache(config=model.config)
+    >>> cross_attention_cache = DynamicCache(config=model.config)
+    >>> past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)
+    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
+    >>> outputs.past_key_values # access cache filled with key/values from generation
+    EncoderDecoderCache()
+    ```
+    """
+    def __init__(self, *caches) -> None:
+        # For dp and ddp support, if only one argument is passed, it should be an iterable of DynamicCache ddp data
+        if len(caches) == 1:
+            self_attention_cache_data, cross_attention_cache_data = [], []
+            for combined_cache_data in caches[0]:
+                # Each per-layer entry is split in half: first half -> self-attention, second half -> cross-attention
+                if len(combined_cache_data) == 6:  # two tuple of style (self_attn_k, self_attn_v, self_attn_sliding)
+                    self_attention_cache_data.append(combined_cache_data[:3])
+                    cross_attention_cache_data.append(combined_cache_data[3:])
+                # To support old DDP-style init, we handle the case where the tuple has no sliding window tensor
+                elif len(combined_cache_data) == 4:  # two tuple of style (self_attn_k, self_attn_v)
+                    self_attention_cache_data.append(combined_cache_data[:2])
+                    cross_attention_cache_data.append(combined_cache_data[2:])
+                else:
+                    raise ValueError(f"Expected {len(combined_cache_data) = } to be 4 or 6.\n{combined_cache_data = }")
+            self.self_attention_cache = DynamicCache(self_attention_cache_data)
+            self.cross_attention_cache = DynamicCache(cross_attention_cache_data)
+        # Otherwise, we should get two arguments, a self-attention cache and a cross-attention cache
+        elif len(caches) == 2:
+            if not isinstance(caches[0], Cache) or not isinstance(caches[1], Cache):
+                raise TypeError(f"One of the two arguments is not a Cache: {type(caches[0]) = }, {type(caches[1]) = }")
+            self.self_attention_cache = caches[0]
+            self.cross_attention_cache = caches[1]
+        # Error case
+        else:
+            raise ValueError(f"Expected 1 or 2 arguments, got {len(caches)}")
+        # Per-layer flag: True when the cross-attention cache already reports a non-zero sequence length for the layer
+        self.is_updated = {}
+        for layer_idx in range(len(self.cross_attention_cache)):
+            self.is_updated[layer_idx] = bool(self.cross_attention_cache.get_seq_length(layer_idx) > 0)
+    def __iter__(self):
+        """Returns tuples of style (self_attn_k, self_attn_v, self_attn_sliding, cross_attn_k, cross_attn_v, cross_attn_sliding)"""
+        # Layers are paired with `zip`, so iteration stops at the shorter of the two caches
+        for self_attention_layer, cross_attention_layer in zip(self.self_attention_cache, self.cross_attention_cache):
+            yield self_attention_layer + cross_attention_layer
+    def __repr__(self) -> str:
+        """Returns the class name followed by the representation of both wrapped caches."""
+        return (
+            f"{self.__class__.__name__}(self_attention_cache={self.self_attention_cache}, cross_attention_cache="
+            f"{self.cross_attention_cache})"
+        )
+    def __len__(self):
+        """
+        Support for backwards-compatible `past_key_values` length, e.g. `len(past_key_values)`. This value corresponds
+        to the number of layers in the model.
+        """
+        # The length is taken from the self-attention cache only
+        return len(self.self_attention_cache)
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # Delegates to the self-attention cache; the cross-attention cache is not consulted
+        return self.self_attention_cache.get_seq_length(layer_idx)
+    def reset(self):
+        """Resets both wrapped caches and marks every layer in `is_updated` as not updated."""
+        self.self_attention_cache.reset()
+        self.cross_attention_cache.reset()
+        for layer_idx in self.is_updated:
+            self.is_updated[layer_idx] = False
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        self.self_attention_cache.reorder_cache(beam_idx)
+        self.cross_attention_cache.reorder_cache(beam_idx)
+    def check_dynamic_cache(self, method: str):
+        """
+        Raises a `TypeError` unless both wrapped caches are `DynamicCache` instances. `method` is the name of the
+        calling method and is only used in the error message.
+        """
+        if not (
+            isinstance(self.self_attention_cache, DynamicCache)
+            and isinstance(self.cross_attention_cache, DynamicCache)
+        ):
+            raise TypeError(
+                f"`{method}` is only defined for dynamic cache, got {self.self_attention_cache.__str__()} for the self "
+                f"attention cache and {self.cross_attention_cache.__str__()} for the cross attention cache."
+            )
+    # TODO(gante, sanchit-gandhi): move following functionality into `.generate`
+    def crop(self, maximum_length: int):
+        """
+        Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be
+        negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search (on the Hub).
+        """
+        self.check_dynamic_cache(self.crop.__name__)
+        # Only the self-attention cache is cropped; the cross-attention cache is left untouched
+        self.self_attention_cache.crop(maximum_length)
+    def batch_repeat_interleave(self, repeats: int):
+        """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search (on the Hub)."""
+        self.check_dynamic_cache(self.batch_repeat_interleave.__name__)
+        self.self_attention_cache.batch_repeat_interleave(repeats)
+        self.cross_attention_cache.batch_repeat_interleave(repeats)
+    def batch_select_indices(self, indices: torch.Tensor):
+        """Only keep the `indices` in the batch dimension of the cache. Used in contrastive search (on the Hub)."""
+        self.check_dynamic_cache(self.batch_select_indices.__name__)
+        self.self_attention_cache.batch_select_indices(indices)
+        self.cross_attention_cache.batch_select_indices(indices)
+    def get_max_cache_shape(self) -> int:
+        """Returns the maximum sequence length (i.e. max capacity) of the cache object"""
+        # Reported for the self-attention cache only
+        return self.self_attention_cache.get_max_cache_shape()
+    def get_mask_sizes(self, query_length: int, layer_idx: int) -> tuple[int, int]:
+        """Forwards to `get_mask_sizes` of the self-attention cache and returns its result unchanged."""
+        return self.self_attention_cache.get_mask_sizes(query_length, layer_idx)
+    @property
+    def is_sliding(self):
+        """Mirrors the `is_sliding` attribute of the self-attention cache."""
+        return self.self_attention_cache.is_sliding
+    @property
+    def is_compileable(self) -> bool:
+        """Mirrors the `is_compileable` attribute of the self-attention cache."""
+        return self.self_attention_cache.is_compileable
+# Deprecated alias: SlidingWindowCache was removed in transformers v5. StaticCache is the replacement.
+# The old name is bound to the very same class object, so `SlidingWindowCache is StaticCache` holds.
+SlidingWindowCache = StaticCache

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/configuration_utils.py ADDED Viewed

	@@ -0,0 +1,1365 @@

+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Configuration base class and utilities."""
+import copy
+import json
+import math
+import os
+from collections.abc import Sequence
+from dataclasses import MISSING, dataclass, fields
+from functools import wraps
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, Union
+from huggingface_hub import create_repo
+from huggingface_hub.dataclasses import strict
+from packaging import version
+from typing_extensions import dataclass_transform
+from . import __version__
+from .dynamic_module_utils import custom_object_save
+from .generation.configuration_utils import GenerationConfig
+from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
+from .modeling_rope_utils import RotaryEmbeddingConfigMixin
+from .utils import (
+    CONFIG_NAME,
+    PushToHubMixin,
+    cached_file,
+    copy_func,
+    extract_commit_hash,
+    is_torch_available,
+    logging,
+)
+from .utils.generic import is_timm_config_dict
+if TYPE_CHECKING:
+    import torch
+logger = logging.get_logger(__name__)
+# type hinting: specifying the type of config class that inherits from PreTrainedConfig
+SpecificPreTrainedConfigType = TypeVar("SpecificPreTrainedConfigType", bound="PreTrainedConfig")
+_FLOAT_TAG_KEY = "__float__"
+_FLOAT_TAG_VALUES = {"Infinity": float("inf"), "-Infinity": float("-inf"), "NaN": float("nan")}
+# Names accepted as entries of a config's `layer_types`; `PreTrainedConfig.validate_layer_type` raises a
+# `ValueError` for any entry that is not listed here.
+ALLOWED_LAYER_TYPES = (
+    "full_attention",
+    "sliding_attention",
+    "chunked_attention",
+    "compressed_sparse_attention",  # CSA, used in deepseek_v4
+    "heavily_compressed_attention",  # HCA, used in deepseek_v4
+    "linear_attention",  # used in minimax
+    "conv",  # used in LFMv2
+    "mamba",
+    "attention",
+    "sparse",
+    "dense",
+    "hybrid",  # for layers that have both mamba and attention in zamba and zamba2
+    "moe",  # for nemotron_h, which uses either attention, mamba or moe
+)
+# copied from huggingface_hub.dataclasses.strict when `accept_kwargs=True`
+def wrap_init_to_accept_kwargs(cls: dataclass):
+    """
+    Replace `cls.__init__` with a keyword-only initializer that tolerates keyword arguments which are not
+    dataclass fields.
+    The replacement sets every dataclass field from the matching keyword argument, falling back to the field's
+    `default` and then to its `default_factory`. All remaining keyword arguments are forwarded to
+    `self.__post_init__(**additional_kwargs)`.
+    Args:
+        cls: The dataclass whose `__init__` is replaced in place.
+    Returns:
+        The same `cls` object, with `__init__` reassigned.
+    Raises (from the generated `__init__`):
+        ValueError: If any positional argument is passed.
+        TypeError: If a field has neither a provided value, a default, nor a default factory.
+    """
+    # Get the original dataclass-generated __init__
+    original_init = cls.__init__
+    @wraps(original_init)
+    def __init__(self, *args, **kwargs: Any) -> None:
+        # Extract only the fields that are part of the dataclass
+        dataclass_fields = {f.name for f in fields(cls)}
+        standard_kwargs = {k: v for k, v in kwargs.items() if k in dataclass_fields}
+        # We need to call bare `__init__` without `__post_init__` but the `original_init` of
+        # any dataclass contains a call to post-init at the end (without kwargs)
+        if len(args) > 0:
+            raise ValueError(
+                f"{cls.__name__} accepts only keyword arguments, but found `{len(args)}` positional args."
+            )
+        # Re-implement the field assignment by hand; `original_init` is only used above for `@wraps` metadata
+        for f in fields(cls):  # type: ignore
+            if f.name in standard_kwargs:
+                setattr(self, f.name, standard_kwargs[f.name])
+            elif f.default is not MISSING:
+                setattr(self, f.name, f.default)
+            elif f.default_factory is not MISSING:
+                setattr(self, f.name, f.default_factory())
+            else:
+                raise TypeError(f"Missing required field - '{f.name}'")
+        # Pass any additional kwargs to `__post_init__` and let the object
+        # decide whether to set the attr or use for different purposes (e.g. BC checks)
+        additional_kwargs = {}
+        for name, value in kwargs.items():
+            if name not in dataclass_fields:
+                additional_kwargs[name] = value
+        self.__post_init__(**additional_kwargs)
+    cls.__init__ = __init__
+    return cls
+@dataclass_transform(kw_only_default=True)
+@strict(accept_kwargs=True)
+@dataclass(repr=False)
+class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
+    # no-format
+    r"""
+    Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
+    methods for loading/downloading/saving configurations.
+    <Tip>
+    A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
+    initialize a model does **not** load the model weights. It only affects the model's configuration.
+    </Tip>
+    Class attributes (overridden by derived classes):
+    - **model_type** (`str`) -- An identifier for the model type, serialized into the JSON file, and used to recreate
+      the correct object in [`~transformers.AutoConfig`].
+    - **has_no_defaults_at_init** (`bool`) -- Whether the config class can be initialized without providing input arguments.
+      Some configurations require inputs to be defined at init and have no default values; usually these are composite configs,
+      (but not necessarily) such as [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`]. They have to be initialized from
+      two or more configs of type [`~transformers.PreTrainedConfig`].
+    - **keys_to_ignore_at_inference** (`list[str]`) -- A list of keys to ignore by default when looking at dictionary
+      outputs of the model during inference.
+    - **attribute_map** (`dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
+      naming of attributes.
+    - **base_model_tp_plan** (`dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor
+      parallel plan applied to the sub-module when `model.tensor_parallel` is called.
+    - **base_model_pp_plan** (`dict[str, tuple[list[str]]]`) -- A dict that maps child-modules of a base model to a
+      pipeline parallel plan that enables users to place the child-module on the appropriate device.
+    Common attributes (present in all subclasses):
+    - **vocab_size** (`int`) -- The number of tokens in the vocabulary, which is also the first dimension of the
+      embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT).
+    - **hidden_size** (`int`) -- The hidden size of the model.
+    - **num_attention_heads** (`int`) -- The number of attention heads used in the multi-head attention layers of the
+      model.
+    - **num_hidden_layers** (`int`) -- The number of blocks in the model.
+    <Tip warning={true}>
+    Setting parameters for sequence generation in the model config is deprecated. For backward compatibility, loading
+    some of them will still be possible, but attempting to overwrite them will throw an exception -- you should set
+    them in a [~transformers.GenerationConfig]. Check the documentation of [~transformers.GenerationConfig] for more
+    information about the individual parameters.
+    </Tip>
+    Arg:
+        name_or_path (`str`, *optional*, defaults to `""`):
+            Store the string that was passed to [`PreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path`
+            if the configuration was created with such a method.
+        output_hidden_states (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return all hidden-states.
+        output_attentions (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return all attentions.
+        return_dict (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return a [`~transformers.utils.ModelOutput`] instead of a plain tuple.
+        is_encoder_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as an encoder/decoder or not.
+        chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
+            The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
+            the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
+            sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
+            Forward Chunking work?](../glossary.html#feed-forward-chunking).
+        > Parameters for fine-tuning tasks
+        architectures (`list[str]`, *optional*):
+            Model architectures that can be used with the model pretrained weights.
+        id2label (`dict[int, str]`, *optional*):
+            A map from index (for instance prediction index, or target index) to label.
+        label2id (`dict[str, int]`, *optional*):
+            A map from label to index for the model.
+        num_labels (`int`, *optional*):
+            Number of labels to use in the last layer added to the model, typically for a classification task.
+        problem_type (`str`, *optional*):
+            Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
+            `"single_label_classification"` or `"multi_label_classification"`.
+        > PyTorch specific parameters
+        dtype (`str`, *optional*):
+            The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
+            (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
+            model is `float16`, ideally we want to load it back using the minimal amount of memory needed to load
+            `float16` weights.
+    """
+    # Class attributes that we don't want to save or have in `self.__dict__`
+    # They are not supposed to be set/changed by users. Each field is set when
+    # creating a model class
+    base_config_key: ClassVar[str] = ""
+    sub_configs: ClassVar[dict[str, type["PreTrainedConfig"]]] = {}
+    has_no_defaults_at_init: ClassVar[bool] = False
+    keys_to_ignore_at_inference: ClassVar[list[str]] = []
+    attribute_map: ClassVar[dict[str, str]] = {}
+    base_model_tp_plan: ClassVar[dict[str, Any] | None] = None
+    base_model_pp_plan: ClassVar[dict[str, Sequence[list[str]]] | None] = None
+    base_model_ep_plan: ClassVar[dict[str, Sequence[list[str]]] | None] = None
+    _auto_class: ClassVar[str | None] = None
+    # Attributes set internally when saving and used to infer model
+    # class for `Auto` mapping
+    model_type: ClassVar[str] = ""
+    transformers_version: str | None = None
+    architectures: list[str] | None = None
+    # Common attributes for all models
+    output_hidden_states: bool | None = False
+    return_dict: bool | None = True
+    dtype: Union[str, "torch.dtype"] | None = None
+    chunk_size_feed_forward: int = 0
+    is_encoder_decoder: bool = False
+    # Fine-tuning task arguments
+    id2label: dict[int, str] | dict[str, str] | None = None
+    label2id: dict[str, int] | dict[str, str] | None = None
+    problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None
+    def __post_init__(self, **kwargs):
+        """
+        Finish initialization using the keyword arguments that are not dataclass fields.
+        In order: resolves the legacy `torch_dtype` key into `dtype`, converts a string `dtype` to a `torch` attribute
+        when torch is available, sets up `num_labels`/`id2label`, validates `problem_type`, converts legacy RoPE
+        keys, drops generation parameters, stores the private bookkeeping attributes and finally sets every
+        remaining kwarg as an attribute on the config.
+        Raises:
+            ValueError: If `problem_type="single_label_classification"` is combined with `num_labels == 1`.
+            AttributeError: Re-raised (after logging) when a leftover kwarg cannot be set as an attribute.
+        """
+        # BC for the `torch_dtype` argument instead of the simpler `dtype`
+        # Do not warn, as it would otherwise always be triggered since most configs on the hub have `torch_dtype`
+        if (torch_dtype := kwargs.pop("torch_dtype", None)) is not None:
+            # If both are provided, keep `dtype`
+            self.dtype = self.dtype if self.dtype is not None else torch_dtype
+        if self.dtype is not None and isinstance(self.dtype, str) and is_torch_available():
+            # we will start using self.dtype in v5, but to be consistent with
+            # from_pretrained's dtype arg convert it to an actual torch.dtype object
+            import torch
+            self.dtype = getattr(torch, self.dtype)
+        # Keep the default value of `num_labels=2` in case users have saved a classifier with 2 labels
+        # Our configs prev wouldn't save `id2label` for 2 labels because it is the default. In all other
+        # cases we expect the config dict to have an `id2label` field if it's a clf model, or not otherwise
+        if self.id2label is None:
+            self.num_labels = kwargs.get("num_labels", 2)
+        else:
+            if kwargs.get("num_labels") is not None and len(self.id2label) != kwargs.get("num_labels"):
+                logger.warning(
+                    f"You passed `num_labels={kwargs.get('num_labels')}` which is incompatible to "
+                    f"the `id2label` map of length `{len(self.id2label)}`."
+                )
+            # Keys are always strings in JSON so convert ids to int
+            self.id2label = {int(key): value for key, value in self.id2label.items()}
+        if self.problem_type == "single_label_classification" and self.num_labels == 1:
+            raise ValueError(
+                '`problem_type="single_label_classification"` requires `num_labels > 1`. For binary '
+                'classification use `num_labels=2`, or use `problem_type="regression"` for a '
+                "single-output regression head."
+            )
+        # BC for rotary embeddings. We will pop out legacy keys from kwargs and rename to new format
+        if hasattr(self, "rope_parameters"):
+            kwargs = self.convert_rope_params_to_dict(**kwargs)
+        elif kwargs.get("rope_scaling") and kwargs.get("rope_theta"):
+            logger.warning(
+                f"{self.__class__.__name__} got `key=rope_scaling` in kwargs but hasn't set it as attribute. "
+                "For RoPE standardization you need to set `self.rope_parameters` in model's config. "
+            )
+            kwargs = self.convert_rope_params_to_dict(**kwargs)
+        # Parameters for sequence generation saved in the config are popped instead of loading them.
+        for parameter_name in GenerationConfig._get_default_generation_params().keys():
+            kwargs.pop(parameter_name, None)
+        # Name or path to the pretrained checkpoint
+        self._name_or_path = str(kwargs.pop("name_or_path", ""))
+        self._commit_hash = kwargs.pop("_commit_hash", None)
+        # Attention/Experts implementation to use, if relevant (it sets it recursively on sub-configs)
+        self._output_attentions: bool | None = kwargs.pop("output_attentions", False)
+        self._attn_implementation: str | None = kwargs.pop("attn_implementation", None)
+        self._experts_implementation: str | None = kwargs.pop("experts_implementation", None)
+        # Additional attributes without default values
+        # NOTE: `num_labels` was read with `.get` above, so if it was passed it is still in `kwargs` and is set here too
+        for key, value in kwargs.items():
+            # Check this to avoid deserializing problematic fields from hub configs - they should use the public field
+            if key not in ("_attn_implementation_internal", "_experts_implementation_internal"):
+                try:
+                    setattr(self, key, value)
+                except AttributeError as err:
+                    logger.error(f"Can't set {key} with value {value} for {self}")
+                    raise err
+    def __init_subclass__(cls, *args, **kwargs):
+        """
+        Hook run for every subclass: applies `dataclass(repr=False, kw_only=True)` to it and, when the subclass
+        does not define `__init__` in its own `__dict__`, wraps the generated `__init__` with
+        `wrap_init_to_accept_kwargs` so unknown keyword arguments are accepted.
+        """
+        super().__init_subclass__(*args, **kwargs)
+        # Must be checked before `dataclass(...)` runs, since that call may add a generated `__init__` to the class
+        cls_has_custom_init = "__init__" in cls.__dict__
+        # kw_only=True ensures fields without defaults in subclasses can follow
+        # parent fields that have defaults (Python dataclass ordering rule).
+        # Config fields are always passed as keyword arguments, so this is safe.
+        cls = dataclass(cls, repr=False, kw_only=True)
+        if not cls_has_custom_init:
+            # Wrap all subclasses to accept arbitrary kwargs for BC
+            # only if the subclass has no custom `__init__`. Most
+            # remote code has an init defined, but some model are not
+            # See https://huggingface.co/hmellor/Ilama-3.2-1B/blob/main/configuration_ilama.py
+            cls = wrap_init_to_accept_kwargs(cls)
+    @property
+    def name_or_path(self) -> str | None:
+        """`str` or `None`: The value stored in `_name_or_path`, or `None` if that attribute has not been set."""
+        return getattr(self, "_name_or_path", None)
+    @name_or_path.setter
+    def name_or_path(self, value):
+        self._name_or_path = str(value)  # Make sure that name_or_path is a string (for JSON encoding)
+    @property
+    def num_labels(self) -> int | None:
+        """
+        `int`: The number of labels for classification models, computed as `len(self.id2label)`. `None` when
+        `id2label` is not set.
+        """
+        return len(self.id2label) if self.id2label is not None else None
+    @num_labels.setter
+    def num_labels(self, num_labels: int):
+        # we do not store `num_labels` attribute in config, but instead
+        # compute it based on the length of the `id2label` map
+        # An existing `id2label` of the requested length is kept as-is; otherwise both maps are regenerated
+        # with default `LABEL_{i}` names, and `label2id` is built as the inverse of `id2label`
+        if self.id2label is None or self.num_labels != num_labels:
+            self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
+            self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
+    @property
+    def output_attentions(self):
+        """
+        `bool`: Whether or not the model should return all attentions.
+        """
+        return self._output_attentions
+    @output_attentions.setter
+    def output_attentions(self, value: bool):
+        """
+        Store `value` in `_output_attentions`. A truthy `value` requires the `"eager"` attention implementation:
+        it is selected automatically when none is set yet, and a `ValueError` is raised for any other one.
+        """
+        # If we set `output_attentions` explicitly before the attn implementation, dispatch eager
+        if value and self._attn_implementation is None:
+            self._attn_implementation = "eager"
+        if value and self._attn_implementation != "eager":
+            raise ValueError(
+                "The `output_attentions` attribute is not supported when using the `attn_implementation` set to "
+                f"{self._attn_implementation}. Please set it to 'eager' instead."
+            )
+        self._output_attentions = value
+    @property
+    def _attn_implementation(self):
+        """The attention implementation stored in `_attn_implementation_internal`."""
+        return self._attn_implementation_internal
+    @_attn_implementation.setter
+    def _attn_implementation(self, value: str | dict | None):
+        """We set it recursively on the sub-configs as well"""
+        # Set it for the current config
+        # `getattr(..., None)` covers the case where `_attn_implementation_internal` has not been assigned yet
+        current_attn = getattr(self, "_attn_implementation", None)
+        # For a dict `value`, the "" key addresses this config; a missing key keeps the current setting
+        attn_implementation = value if not isinstance(value, dict) else value.get("", current_attn)
+        self._attn_implementation_internal = attn_implementation
+        # Set it recursively on the subconfigs
+        for subconfig_key in self.sub_configs:
+            subconfig = getattr(self, subconfig_key, None)
+            if subconfig is not None:
+                current_subconfig_attn = getattr(subconfig, "_attn_implementation", None)
+                # For a dict `value`, each sub-config is addressed by its key in `sub_configs`
+                sub_implementation = (
+                    value if not isinstance(value, dict) else value.get(subconfig_key, current_subconfig_attn)
+                )
+                subconfig._attn_implementation = sub_implementation
+    @property
+    def _experts_implementation(self):
+        """The experts implementation stored in `_experts_implementation_internal`."""
+        return self._experts_implementation_internal
+    @_experts_implementation.setter
+    def _experts_implementation(self, value: str | dict | None):
+        """We set it recursively on the sub-configs as well"""
+        # Set it for the current config
+        # `getattr(..., None)` covers the case where `_experts_implementation_internal` has not been assigned yet
+        current_moe = getattr(self, "_experts_implementation", None)
+        # For a dict `value`, the "" key addresses this config; a missing key keeps the current setting
+        experts_implementation = value if not isinstance(value, dict) else value.get("", current_moe)
+        self._experts_implementation_internal = experts_implementation
+        # Set it recursively on the subconfigs
+        for subconfig_key in self.sub_configs:
+            subconfig = getattr(self, subconfig_key, None)
+            if subconfig is not None:
+                current_subconfig_moe = getattr(subconfig, "_experts_implementation", None)
+                # For a dict `value`, each sub-config is addressed by its key in `sub_configs`
+                sub_implementation = (
+                    value if not isinstance(value, dict) else value.get(subconfig_key, current_subconfig_moe)
+                )
+                subconfig._experts_implementation = sub_implementation
+    @property
+    def torch_dtype(self):
+        """Deprecated read alias for `dtype`; logs a one-time deprecation warning."""
+        logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
+        return self.dtype
+    @property
+    def use_return_dict(self):
+        """Deprecated read alias for `return_dict`; logs a one-time deprecation warning."""
+        logger.warning_once("`use_return_dict` is deprecated! Use `return_dict` instead!")
+        return self.return_dict
+    # Setter for the `torch_dtype` property defined above (`use_return_dict` has no setter)
+    @torch_dtype.setter
+    def torch_dtype(self, value):
+        logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
+        self.dtype = value
+    def __setattr__(self, key, value):
+        """Set an attribute, first translating `key` through `attribute_map` when it is listed there."""
+        # `attribute_map` is fetched through `super().__getattribute__` rather than `self.attribute_map`
+        if key in super().__getattribute__("attribute_map"):
+            key = super().__getattribute__("attribute_map")[key]
+        super().__setattr__(key, value)
+    def __getattribute__(self, key):
+        """Get an attribute, first translating `key` through `attribute_map` when it is listed there."""
+        # `attribute_map` itself is never remapped, and is read via `super()` so this method is not re-entered
+        if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
+            key = super().__getattribute__("attribute_map")[key]
+        return super().__getattribute__(key)
+    def validate_output_attentions(self):
+        """
+        Raise a `ValueError` when `output_attentions` is truthy while the attention implementation is set to
+        anything other than `"eager"` or `None`.
+        """
+        if self.output_attentions and self._attn_implementation not in ["eager", None]:
+            raise ValueError(
+                "The `output_attentions` attribute is not supported when using the `attn_implementation` set to "
+                f"{self._attn_implementation}. Please set it to 'eager' instead."
+            )
+    def validate_architecture(self):
+        """
+        Part of `@strict`-powered validation. Validates the architecture of the config.
+        Only applies when the config has all of `head_dim`, `num_heads` and `embed_dim`; raises a `ValueError`
+        when `head_dim * num_heads != embed_dim`.
+        """
+        # NOTE(review): the check is an exact product equality, while the message below talks about
+        # `embed_dim` being a multiple of the head count - confirm which wording is intended
+        if (
+            hasattr(self, "head_dim")
+            and hasattr(self, "num_heads")
+            and hasattr(self, "embed_dim")
+            and self.head_dim * self.num_heads != self.embed_dim
+        ):
+            raise ValueError(
+                f"The embed_dim ({self.embed_dim}) is not a multiple of the number of attention "
+                f"heads ({self.num_heads})."
+            )
+    def validate_token_ids(self):
+        """
+        Part of `@strict`-powered validation. Validates the contents of the special tokens.
+        Logs a one-time warning (it does not raise) for every integer `*_token_id` attribute of the decoder text
+        config that lies outside `[0, vocab_size)`. Nothing is checked when the text config has no `vocab_size`.
+        """
+        text_config = self.get_text_config(decoder=True)
+        vocab_size = getattr(text_config, "vocab_size", None)
+        if vocab_size is not None:
+            # Check for all special tokens, e.g. pad_token_id, image_token_id, audio_token_id
+            # presumably iterating a config yields its attribute names - `__iter__` is not visible here, TODO confirm
+            for name in text_config:
+                value = getattr(text_config, name)
+                if name.endswith("_token_id") and isinstance(value, int) and not 0 <= value < vocab_size:
+                    # Can't be an exception until we can load configs that fail validation: several configs on the Hub
+                    # store invalid special tokens, e.g. `pad_token_id=-1`
+                    logger.warning_once(
+                        f"Model config: {name} must be `None` or an integer within the vocabulary (between 0 "
+                        f"and {vocab_size - 1}), got {value}. This may result in unexpected behavior."
+                    )
+    def validate_layer_type(self):
+        """
+        Check that `layer_types` is correctly defined.
+        The check is skipped unless `layer_types` is set (not `None`) and the config has a `num_hidden_layers`
+        attribute.
+        Raises:
+            ValueError: If an entry of `layer_types` is not in `ALLOWED_LAYER_TYPES`, or if `num_hidden_layers`
+                is not `None` and differs from `len(layer_types)`.
+        """
+        if not (getattr(self, "layer_types", None) is not None and hasattr(self, "num_hidden_layers")):
+            return
+        elif not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in self.layer_types):
+            raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES} but got {self.layer_types}")
+        elif self.num_hidden_layers is not None and self.num_hidden_layers != len(self.layer_types):
+            raise ValueError(
+                f"`num_hidden_layers` ({self.num_hidden_layers}) must be equal to the number of layer types "
+                f"({len(self.layer_types)})"
+            )
+    @property
+    def rope_scaling(self):
+        """Alias that reads `rope_parameters`."""
+        return self.rope_parameters
+    @rope_scaling.setter
+    def rope_scaling(self, value):
+        # Writes through to `rope_parameters`, so both names always refer to the same value
+        self.rope_parameters = value
+    def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
+        """
+        Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~PreTrainedConfig.from_pretrained`] class method.
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`dict[str, Any]`, *optional*):
+                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+        generation_parameters = self._get_generation_parameters()
+        if len(generation_parameters) > 0:
+            raise ValueError(
+                "Some generation parameters are set in the model config. These should go into `model.generation_config`"
+                f"as opposed to `model.config`. \nGeneration parameters found: {str(generation_parameters)}",
+            )
+        os.makedirs(save_directory, exist_ok=True)
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
+            files_timestamps = self._get_files_timestamps(save_directory)
+        # This attribute is important to know on load, but should not be serialized on save.
+        if "transformers_weights" in self:
+            delattr(self, "transformers_weights")
+        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
+        # loaded from the Hub.
+        if self._auto_class is not None:
+            custom_object_save(self, save_directory, config=self)
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+        # Strict validation at save-time: prevent bad patterns from propagating
+        # Using `strict` decorator guarantees that `self.validate` exists , but not all
+        # model config might have the decorator added
+        if hasattr(self, "validate"):
+            self.validate()
+        self.to_json_file(output_config_file, use_diff=True)
+        logger.info(f"Configuration saved in {output_config_file}")
+        if push_to_hub:
+            self._upload_modified_files(
+                save_directory,
+                repo_id,
+                files_timestamps,
+                commit_message=commit_message,
+                token=kwargs.get("token"),
+            )
+    @classmethod
+    def from_pretrained(
+        cls: type[SpecificPreTrainedConfigType],
+        pretrained_model_name_or_path: str | os.PathLike,
+        cache_dir: str | os.PathLike | None = None,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: str | bool | None = None,
+        revision: str = "main",
+        **kwargs,
+    ) -> SpecificPreTrainedConfigType:
+        r"""
+        Instantiate a [`PreTrainedConfig`] (or a derived class) from a pretrained model configuration.
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+                - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+                  huggingface.co.
+                - a path to a *directory* containing a configuration file saved using the
+                  [`~PreTrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
+                - a path to a saved configuration JSON *file*, e.g., `./my_model_directory/configuration.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force to (re-)download the configuration files and override the cached versions if
+                they exist.
+            proxies (`dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+            token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
+                the token generated when running `hf auth login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+                <Tip>
+                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
+                </Tip>
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final configuration object.
+                If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a
+                dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
+                part of `kwargs` which has not been used to update `config` and is otherwise ignored.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
+                specify the folder name here.
+            kwargs (`dict[str, Any]`, *optional*):
+                The values in kwargs of any keys which are configuration attributes will be used to override the loaded
+                values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
+                by the `return_unused_kwargs` keyword parameter.
+        Returns:
+            [`PreTrainedConfig`]: The configuration object instantiated from this pretrained model.
+        Examples:
+        ```python
+        # We can't instantiate directly the base class *PreTrainedConfig* so let's show the examples on a
+        # derived class: BertConfig
+        config = BertConfig.from_pretrained(
+            "google-bert/bert-base-uncased"
+        )  # Download configuration from huggingface.co and cache.
+        config = BertConfig.from_pretrained(
+            "./test/saved_model/"
+        )  # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
+        config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json")
+        config = BertConfig.from_pretrained("google-bert/bert-base-uncased", output_attentions=True, foo=False)
+        assert config.output_attentions == True
+        config, unused_kwargs = BertConfig.from_pretrained(
+            "google-bert/bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+        )
+        assert config.output_attentions == True
+        assert unused_kwargs == {"foo": False}
+        ```"""
+        kwargs["cache_dir"] = cache_dir
+        kwargs["force_download"] = force_download
+        kwargs["local_files_only"] = local_files_only
+        kwargs["revision"] = revision
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if cls.base_config_key and cls.base_config_key in config_dict:
+            config_dict = config_dict[cls.base_config_key]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            # sometimes the config has no `base_config_key` if the config is used in several composite models
+            # e.g. LlamaConfig. In that case we try to see if there is match in `model_type` before raising a warning
+            for v in config_dict.values():
+                if isinstance(v, dict) and v.get("model_type") == cls.model_type:
+                    config_dict = v
+            # raise warning only if we still can't see a match in `model_type`
+            if config_dict["model_type"] != cls.model_type:
+                logger.warning(
+                    f"You are using a model of type `{config_dict['model_type']}` to instantiate a model of type "
+                    f"`{cls.model_type}`. This may be expected if you are loading a checkpoint that shares a subset "
+                    f"of the architecture (e.g., loading a `sam2_video` checkpoint into `Sam2Model`), but is otherwise "
+                    f"not supported and can yield errors. Please verify that the checkpoint is compatible with the "
+                    f"model you are instantiating."
+                )
+        return cls.from_dict(config_dict, **kwargs)
+    @classmethod
+    def get_config_dict(
+        cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        """
+        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+        [`PreTrainedConfig`] using `from_dict`.
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
+        Returns:
+            `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
+        """
+        original_kwargs = copy.deepcopy(kwargs)
+        # Get config dict associated with the base config file
+        config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if config_dict is None:
+            return {}, kwargs
+        if "_commit_hash" in config_dict:
+            original_kwargs["_commit_hash"] = config_dict["_commit_hash"]
+        # That config file may point us toward another config file to use.
+        if "configuration_files" in config_dict:
+            configuration_file = get_configuration_file(config_dict["configuration_files"])
+            config_dict, kwargs = cls._get_config_dict(
+                pretrained_model_name_or_path, _configuration_file=configuration_file, **original_kwargs
+            )
+        return config_dict, kwargs
+    @classmethod
+    def _get_config_dict(
+        cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        trust_remote_code = kwargs.pop("trust_remote_code", None)
+        subfolder = kwargs.pop("subfolder", "")
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+        commit_hash = kwargs.pop("_commit_hash", None)
+        gguf_file = kwargs.get("gguf_file")
+        if trust_remote_code is True:
+            logger.warning(
+                "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is"
+                " ignored."
+            )
+        user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
+            # Special case when pretrained_model_name_or_path is a local file
+            resolved_config_file = pretrained_model_name_or_path
+            is_local = True
+        else:
+            configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME) if gguf_file is None else gguf_file
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_config_file = cached_file(
+                    pretrained_model_name_or_path,
+                    configuration_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _commit_hash=commit_hash,
+                )
+                if resolved_config_file is None:
+                    return None, kwargs
+                commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
+            except OSError:
+                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise OSError(
+                    f"Can't load the configuration of '{pretrained_model_name_or_path}'. If you were trying to load it"
+                    " from 'https://huggingface.co/models', make sure you don't have a local directory with the same"
+                    f" name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory"
+                    f" containing a {configuration_file} file"
+                )
+        try:
+            if gguf_file:
+                config_dict = load_gguf_checkpoint(resolved_config_file, return_tensors=False)["config"]
+            else:
+                # Load config dict
+                config_dict = cls._dict_from_json_file(resolved_config_file)
+            config_dict["_commit_hash"] = commit_hash
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            raise OSError(f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file.")
+        if is_local:
+            logger.info(f"loading configuration file {resolved_config_file}")
+        else:
+            logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")
+        # timm models are not saved with the model_type in the config file
+        if "model_type" not in config_dict and is_timm_config_dict(config_dict):
+            config_dict["model_type"] = "timm_wrapper"
+        # Some checkpoints may contain the wrong model_type in the config file.
+        # Allow the user to override it but warn them that it might not work.
+        if "model_type" in kwargs and config_dict["model_type"] != kwargs["model_type"]:
+            logger.warning(
+                f"{configuration_file} has 'model_type={config_dict['model_type']}' but you overrode "
+                f"it with 'model_type={kwargs['model_type']}'. This may lead to unexpected behavior."
+            )
+            config_dict["model_type"] = kwargs["model_type"]
+        return config_dict, kwargs
+    @classmethod
+    def from_dict(
+        cls: type[SpecificPreTrainedConfigType], config_dict: dict[str, Any], **kwargs
+    ) -> SpecificPreTrainedConfigType:
+        """
+        Instantiates a [`PreTrainedConfig`] from a Python dictionary of parameters.
+        Args:
+            config_dict (`dict[str, Any]`):
+                Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
+                retrieved from a pretrained checkpoint by leveraging the [`~PreTrainedConfig.get_config_dict`] method.
+            kwargs (`dict[str, Any]`):
+                Additional parameters from which to initialize the configuration object.
+        Returns:
+            [`PreTrainedConfig`]: The configuration object instantiated from those parameters.
+        """
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
+        if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
+            kwargs.setdefault("_commit_hash", config_dict["_commit_hash"])
+        # To remove arg here are those passed along for our internal telemetry but we still need to remove them
+        to_remove = ["_from_auto", "_from_pipeline"]
+        valid_fields = [
+            "num_labels",
+            "attn_implementation",
+            "experts_implementation",
+            "output_attentions",
+            "torch_dtype",
+            "dtype",
+            "name_or_path",
+        ]
+        for key, value in kwargs.items():
+            if key in valid_fields:
+                if key not in ["torch_dtype", "dtype"]:
+                    config_dict[key] = value
+                    to_remove.append(key)
+                elif value != "auto":
+                    config_dict[key] = value
+        config = cls(**config_dict)
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                current_attr = getattr(config, key)
+                # To authorize passing a custom subconfig as kwarg in models that have nested configs.
+                # We need to update only custom kwarg values instead and keep other attr in subconfig.
+                if isinstance(current_attr, PreTrainedConfig) and isinstance(value, dict):
+                    current_attr_updated = current_attr.to_dict()
+                    current_attr_updated.update(value)
+                    value = current_attr.__class__(**current_attr_updated)
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+        logger.info(f"Model config {config}")
+        if return_unused_kwargs:
+            return config, kwargs
+        else:
+            return config
+    @classmethod
+    def from_json_file(
+        cls: type[SpecificPreTrainedConfigType], json_file: str | os.PathLike
+    ) -> SpecificPreTrainedConfigType:
+        """
+        Instantiates a [`PreTrainedConfig`] from the path to a JSON file of parameters.
+        Args:
+            json_file (`str` or `os.PathLike`):
+                Path to the JSON file containing the parameters.
+        Returns:
+            [`PreTrainedConfig`]: The configuration object instantiated from that JSON file.
+        """
+        config_dict = cls._dict_from_json_file(json_file)
+        return cls(**config_dict)
+    @classmethod
+    def _dict_from_json_file(cls, json_file: str | os.PathLike):
+        with open(json_file, encoding="utf-8") as reader:
+            text = reader.read()
+        config_dict = json.loads(text)
+        return cls._decode_special_floats(config_dict)
+    @classmethod
+    def _encode_special_floats(cls, obj: Any) -> Any:
+        """
+        Iterates over the passed object and encode specific floats that cannot be JSON-serialized. Python's JSON
+        engine saves floats like `Infinity` (+/-) or `NaN` which are not compatible with other JSON engines.
+        It serializes floats like `Infinity` as an object: `{'__float__': Infinity}`.
+        """
+        if isinstance(obj, float):
+            if math.isnan(obj):
+                return {_FLOAT_TAG_KEY: "NaN"}
+            if obj == float("inf"):
+                return {_FLOAT_TAG_KEY: "Infinity"}
+            if obj == float("-inf"):
+                return {_FLOAT_TAG_KEY: "-Infinity"}
+            return obj
+        if isinstance(obj, dict):
+            return {k: cls._encode_special_floats(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [cls._encode_special_floats(v) for v in obj]
+        return obj
+    @classmethod
+    def _decode_special_floats(cls, obj: Any) -> Any:
+        """
+        Iterates over the passed object and decode specific floats that cannot be JSON-serialized. Python's JSON
+        engine saves floats like `Infinity` (+/-) or `NaN` which are not compatible with other JSON engines.
+        This method deserializes objects like `{'__float__': Infinity}` to their float values like `Infinity`.
+        """
+        if isinstance(obj, dict):
+            if set(obj.keys()) == {_FLOAT_TAG_KEY} and isinstance(obj[_FLOAT_TAG_KEY], str):
+                tag = obj[_FLOAT_TAG_KEY]
+                if tag in _FLOAT_TAG_VALUES:
+                    return _FLOAT_TAG_VALUES[tag]
+                return obj
+            return {k: cls._decode_special_floats(v) for k, v in obj.items()}
+        if isinstance(obj, list):
+            return [cls._decode_special_floats(v) for v in obj]
+        return obj
+    def __eq__(self, other):
+        return isinstance(other, PreTrainedConfig) and (self.__dict__ == other.__dict__)
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+    def __iter__(self):
+        yield from self.__dict__
+    def to_diff_dict(self) -> dict[str, Any]:
+        """
+        Removes all attributes from the configuration that correspond to the default config attributes for
+        better readability, while always retaining the `config` attribute from the class. Serializes to a
+        Python dictionary.
+        Returns:
+            dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
+        """
+        config_dict = self.to_dict()
+        # Get the default config dict (from a fresh PreTrainedConfig instance)
+        default_config_dict = PreTrainedConfig().to_dict()
+        # get class specific config dict
+        class_config_dict = self.__class__().to_dict() if not self.has_no_defaults_at_init else {}
+        serializable_config_dict = {}
+        # Only serialize values that differ from the default config,
+        # except always keep the 'config' attribute.
+        for key, value in config_dict.items():
+            if (
+                isinstance(getattr(self, key, None), PreTrainedConfig)
+                and key in class_config_dict
+                and isinstance(class_config_dict[key], dict)
+            ):
+                # For nested configs we need to clean the diff recursively
+                diff = recursive_diff_dict(value, default_config_dict, config_obj=getattr(self, key, None))
+                if "model_type" in value:
+                    # Needs to be set even if it's not in the diff
+                    diff["model_type"] = value["model_type"]
+                serializable_config_dict[key] = diff
+            elif (
+                key not in default_config_dict
+                or key == "transformers_version"
+                or key == "vocab_file"
+                or value != default_config_dict[key]
+                or (key in default_config_dict and value != class_config_dict.get(key, value))
+            ):
+                serializable_config_dict[key] = value
+        self._remove_keys_not_serialized(serializable_config_dict)
+        # Key removed only in diff dict
+        if "_name_or_path" in serializable_config_dict:
+            del serializable_config_dict["_name_or_path"]
+        if hasattr(self, "quantization_config"):
+            serializable_config_dict["quantization_config"] = (
+                self.quantization_config.to_dict()
+                if not isinstance(self.quantization_config, dict) and self.quantization_config is not None
+                else self.quantization_config
+            )
+        self.dict_dtype_to_str(serializable_config_dict)
+        return serializable_config_dict
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary.
+        Returns:
+            `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+        if hasattr(self.__class__, "model_type"):
+            output["model_type"] = self.__class__.model_type
+        # Transformers version when serializing the model
+        output["transformers_version"] = __version__
+        # Pop "kwargs" since they are unpacked and set in the post init
+        output.pop("kwargs", None)
+        def to_list(value):
+            if isinstance(value, tuple):
+                value = [to_list(item) for item in value]
+            return value
+        for key, value in output.items():
+            # Deal with nested configs like CLIP
+            if isinstance(value, PreTrainedConfig):
+                value = value.to_dict()
+                del value["transformers_version"]
+            # Some models have defaults as tuples because dataclass
+            # doesn't allow mutables. Let's convert back to `list``
+            elif isinstance(value, tuple):
+                value = to_list(value)
+            output[key] = value
+        self._remove_keys_not_serialized(output)
+        if hasattr(self, "quantization_config"):
+            output["quantization_config"] = (
+                self.quantization_config.to_dict()
+                if not isinstance(self.quantization_config, dict) and self.quantization_config is not None
+                else self.quantization_config
+            )
+        self.dict_dtype_to_str(output)
+        return output
+    def to_json_string(self, use_diff: bool = True) -> str:
+        """
+        Serializes this instance to a JSON string.
+        Args:
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default `PreTrainedConfig()`
+                is serialized to JSON string.
+        Returns:
+            `str`: String containing all the attributes that make up this configuration instance in JSON format.
+        """
+        if use_diff is True:
+            config_dict = self.to_diff_dict()
+        else:
+            config_dict = self.to_dict()
+        # Handle +/-Infinity and NaNs
+        config_dict = self._encode_special_floats(config_dict)
+        return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+    def to_json_file(self, json_file_path: str | os.PathLike, use_diff: bool = True):
+        """
+        Save this instance to a JSON file.
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file in which this configuration instance's parameters will be saved.
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default `PreTrainedConfig()`
+                is serialized to JSON file.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string(use_diff=use_diff))
+    def update(self, config_dict: dict[str, Any]):
+        """
+        Updates attributes of this class with attributes from `config_dict`.
+        Args:
+            config_dict (`dict[str, Any]`): Dictionary of attributes that should be updated for this class.
+        """
+        for key, value in config_dict.items():
+            setattr(self, key, value)
+    def update_from_string(self, update_str: str):
+        """
+        Updates attributes of this class with attributes from `update_str`.
+        The expected format is ints, floats and strings as is, and for booleans use `true` or `false`. For example:
+        "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
+        The keys to change have to already exist in the config object.
+        Args:
+            update_str (`str`): String with attributes that should be updated for this class.
+        """
+        d = dict(x.split("=") for x in update_str.split(","))
+        for k, v in d.items():
+            if not hasattr(self, k):
+                raise ValueError(f"key {k} isn't in the original config dict")
+            old_v = getattr(self, k)
+            if isinstance(old_v, bool):
+                if v.lower() in ["true", "1", "y", "yes"]:
+                    v = True
+                elif v.lower() in ["false", "0", "n", "no"]:
+                    v = False
+                else:
+                    raise ValueError(f"can't derive true or false from {v} (key {k})")
+            elif isinstance(old_v, int):
+                v = int(v)
+            elif isinstance(old_v, float):
+                v = float(v)
+            elif not isinstance(old_v, str):
+                raise TypeError(
+                    f"You can only update int, float, bool or string values in the config, got {v} for key {k}"
+                )
+            setattr(self, k, v)
+    def dict_dtype_to_str(self, d: dict[str, Any]) -> None:
+        """
+        Checks whether the passed dictionary and its nested dicts have a *dtype* key and if it's not None,
+        converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
+        string, which can then be stored in the json format.
+        """
+        if d.get("dtype") is not None:
+            if isinstance(d["dtype"], dict):
+                d["dtype"] = {k: str(v).split(".")[-1] for k, v in d["dtype"].items()}
+            # models like Emu3 can have "dtype" as token in config's vocabulary map,
+            # so we also exclude int type here to avoid error in this special case.
+            elif not isinstance(d["dtype"], (str, int)):
+                d["dtype"] = str(d["dtype"]).split(".")[1]
+        for value in d.values():
+            if isinstance(value, dict):
+                self.dict_dtype_to_str(value)
+    def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None:
+        """
+        Checks and removes if there are any keys in the dict that should not be serialized when saving the config.
+        Runs recursive check on the dict, to remove from all sub configs.
+        """
+        for key_to_remove in [
+            "_is_quantized",
+            "_auto_class",
+            "_commit_hash",
+            "_attn_implementation_internal",
+            "_experts_implementation_internal",
+            "ignore_keys_at_rope_validation",
+            "base_model_tp_plan",
+            "base_model_pp_plan",
+        ]:
+            d.pop(key_to_remove, None)
+        if "_output_attentions" in d:
+            d["output_attentions"] = d.pop("_output_attentions")
+        for value in d.values():
+            if isinstance(value, dict):
+                self._remove_keys_not_serialized(value)
+    @classmethod
+    def register_for_auto_class(cls, auto_class="AutoConfig"):
+        """
+        Register this class with a given auto class. This should only be used for custom configurations as the ones in
+        the library are already mapped with `AutoConfig`.
+        Args:
+            auto_class (`str` or `type`, *optional*, defaults to `"AutoConfig"`):
+                The auto class to register this new configuration with.
+        """
+        if not isinstance(auto_class, str):
+            auto_class = auto_class.__name__
+        import transformers.models.auto as auto_module
+        if not hasattr(auto_module, auto_class):
+            raise ValueError(f"{auto_class} is not a valid auto class.")
+        cls._auto_class = auto_class
+    def _get_generation_parameters(self) -> dict[str, Any]:
+        """
+        Checks if there are generation parameters in `PreTrainedConfig` instance. Note that
+        we should not save generation params in PreTrainedConfig, and we will raise error
+        if there are any.
+        """
+        generation_params = {}
+        default_config = self.__class__().to_dict() if not self.has_no_defaults_at_init else {}
+        for key in GenerationConfig._get_default_generation_params().keys():
+            if key == "use_cache":
+                continue  # common key for most models
+            if hasattr(self, key) and getattr(self, key) is not None and key not in default_config:
+                generation_params[key] = getattr(self, key)
+        return generation_params
+    def get_text_config(self, decoder=None, encoder=None) -> "PreTrainedConfig":
+        """
+        Returns the text config related to the text input (encoder) or text output (decoder) of the model. The
+        `decoder` and `encoder` input arguments can be used to specify which end of the model we are interested in,
+        which is useful on models that have both text input and output modalities.
+        There are three possible outcomes of using this method:
+        1. On most models, it returns the original config instance itself.
+        2. On newer (2024+) composite models, it returns the text section of the config, which is nested under a set
+            of valid names.
+        3. On older (2023-) composite models, it discards decoder-only parameters when `encoder=True` and vice-versa.
+        Args:
+            decoder (`Optional[bool]`, *optional*):
+                If set to `True`, then only search for decoder config names.
+            encoder (`Optional[bool]`, *optional*):
+                If set to `True`, then only search for encoder config names.
+        Returns:
+            `PreTrainedConfig`: `self`, the single nested text sub-config that was found, or -- for flat
+            encoder-decoder configs when only one side is requested -- a deep copy of `self` whose
+            `encoder_*` / `decoder_*` keys of the requested side were renamed to their generic names.
+        Raises:
+            ValueError: If more than one of the candidate sub-config attributes is set (not `None`).
+        """
+        return_both = decoder == encoder  # both unset or both set -> search all possible names
+        decoder_possible_text_config_names = ("decoder", "generator", "text_config")
+        encoder_possible_text_config_names = ("text_encoder",)
+        if return_both:
+            possible_text_config_names = encoder_possible_text_config_names + decoder_possible_text_config_names
+        elif decoder:
+            possible_text_config_names = decoder_possible_text_config_names
+        else:
+            possible_text_config_names = encoder_possible_text_config_names
+        # Keep only the candidate attributes that exist on this config and are not `None`
+        valid_text_config_names = []
+        for text_config_name in possible_text_config_names:
+            if hasattr(self, text_config_name):
+                text_config = getattr(self, text_config_name, None)
+                if text_config is not None:
+                    valid_text_config_names += [text_config_name]
+        if len(valid_text_config_names) > 1:
+            raise ValueError(
+                f"Multiple valid text configs were found in the model config: {valid_text_config_names}. In this "
+                "case, using `get_text_config()` would be ambiguous. Please specify the desired text config directly, "
+                "e.g. `text_config = config.sub_config_name`"
+            )
+        elif len(valid_text_config_names) == 1:
+            config_to_return = getattr(self, valid_text_config_names[0])
+        else:
+            # No nested text config: the config itself is the text config
+            config_to_return = self
+        # handle legacy models with flat config structure, when we only want one of the configs
+        if not return_both and len(valid_text_config_names) == 0 and config_to_return.is_encoder_decoder:
+            # Work on a deep copy so the renaming below never touches `self`
+            config_to_return = copy.deepcopy(config_to_return)
+            prefix_to_keep = "decoder" if decoder else "encoder"
+            # `to_dict()` gives a snapshot of the keys, so attributes can be removed/added while looping
+            for key in config_to_return.to_dict():
+                # NOTE: We can't discard keys because:
+                # 1) we can't truly delete a cls attribute on a dataclass; 2) we can't set the value to `None` due to
+                # strict validation. So we just keep it as is, since there are only a couple old models falling in this condition
+                # (only keys starting with `prefix_to_keep` are rewritten below; all other keys are left untouched)
+                if key.startswith(prefix_to_keep):
+                    # [encoder/decoder]_layers -> num_hidden_layers
+                    if key == prefix_to_keep + "_layers":
+                        new_key = "num_hidden_layers"
+                    # [encoder/decoder]_attention_heads -> num_attention_heads
+                    elif key == prefix_to_keep + "_attention_heads":
+                        new_key = "num_attention_heads"
+                    # e.g. encoder_hidden_act -> hidden_act
+                    else:
+                        new_key = key[len(prefix_to_keep) + 1 :]
+                    # Does the class map the new key into a different attribute name at read time? if so, let's write
+                    # into that attribute instead
+                    if new_key in config_to_return.attribute_map:
+                        new_key = config_to_return.attribute_map[new_key]
+                    # Read the value before removing the prefixed attribute, then store it under the new name
+                    value = getattr(config_to_return, key)
+                    delattr(config_to_return, key)
+                    setattr(config_to_return, new_key, value)
+        return config_to_return
+def get_configuration_file(configuration_files: list[str]) -> str:
+    """
+    Get the configuration file to use for this version of transformers.
+    Args:
+        configuration_files (`list[str]`): The list of available configuration files.
+    Returns:
+        `str`: The configuration file to use.
+    """
+    configuration_files_map = {}
+    for file_name in configuration_files:
+        if file_name.startswith("config.") and file_name.endswith(".json") and file_name != "config.json":
+            v = file_name.removeprefix("config.").removesuffix(".json")
+            configuration_files_map[v] = file_name
+    available_versions = sorted(configuration_files_map.keys())
+    # Defaults to FULL_CONFIGURATION_FILE and then try to look at some newer versions.
+    configuration_file = CONFIG_NAME
+    transformers_version = version.parse(__version__)
+    for v in available_versions:
+        if version.parse(v) <= transformers_version:
+            configuration_file = configuration_files_map[v]
+        else:
+            # No point going further since the versions are sorted.
+            break
+    return configuration_file
+def recursive_diff_dict(dict_a, dict_b, config_obj=None):
+    """
+    Helper function to recursively take the diff between two nested dictionaries. The resulting diff only contains the
+    values from `dict_a` that are different from values in `dict_b`.
+    dict_b : the default config dictionary. We want to remove values that are in this one
+    """
+    diff = {}
+    default = config_obj.__class__().to_dict() if config_obj is not None else {}
+    for key, value in dict_a.items():
+        obj_value = getattr(config_obj, str(key), None)
+        if isinstance(obj_value, PreTrainedConfig) and key in dict_b and isinstance(dict_b[key], dict):
+            diff_value = recursive_diff_dict(value, dict_b[key], config_obj=obj_value)
+            diff[key] = diff_value
+        elif key not in dict_b or (value != default[key]):
+            diff[key] = value
+    return diff
+# Rebind `push_to_hub` to the result of `copy_func` before its docstring is formatted below
+# (presumably so the implementation shared with other classes keeps its template docstring -- TODO confirm).
+PreTrainedConfig.push_to_hub = copy_func(PreTrainedConfig.push_to_hub)
+# `__doc__` may be None, in which case there is nothing to format.
+if PreTrainedConfig.push_to_hub.__doc__ is not None:
+    PreTrainedConfig.push_to_hub.__doc__ = PreTrainedConfig.push_to_hub.__doc__.format(
+        object="config", object_class="AutoConfig", object_files="configuration file"
+    )
+# The alias is only here for BC - we did not have the correct CamelCasing before
+PretrainedConfig = PreTrainedConfig
+def layer_type_validation(layer_types: list[str], num_hidden_layers: int | None = None, attention: bool = True):
+    logger.warning(
+        "`layer_type_validation` is deprecated and will be removed in v5.20. "
+        "Use `PreTrainedConfig.validate_layer_type` instead"
+    )
+    if not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in layer_types):
+        raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES}")
+    if num_hidden_layers is not None and num_hidden_layers != len(layer_types):
+        raise ValueError(
+            f"`num_hidden_layers` ({num_hidden_layers}) must be equal to the number of layer types "
+            f"({len(layer_types)})"
+        )

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/file_utils.py ADDED Viewed

	@@ -0,0 +1,105 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+File utilities: utilities related to download and cache models
+This module should not be updated anymore and is only left for backward compatibility.
+"""
+from . import __version__
+# Backward compatibility imports, to make sure all those objects can be found in file_utils
+from .utils import (
+    CLOUDFRONT_DISTRIB_PREFIX,
+    CONFIG_NAME,
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    ENV_VARS_TRUE_AND_AUTO_VALUES,
+    ENV_VARS_TRUE_VALUES,
+    FEATURE_EXTRACTOR_NAME,
+    HF_MODULES_CACHE,
+    MODEL_CARD_NAME,
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    S3_BUCKET_PREFIX,
+    SENTENCEPIECE_UNDERLINE,
+    SPIECE_UNDERLINE,
+    TRANSFORMERS_DYNAMIC_MODULE_NAME,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+    ContextManagers,
+    DummyObject,
+    EntryNotFoundError,
+    ExplicitEnum,
+    ModelOutput,
+    PaddingStrategy,
+    PushToHubMixin,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    TensorType,
+    _LazyModule,
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    copy_func,
+    define_sagemaker_information,
+    get_torch_version,
+    has_file,
+    http_user_agent,
+    is_apex_available,
+    is_bs4_available,
+    is_coloredlogs_available,
+    is_datasets_available,
+    is_detectron2_available,
+    is_faiss_available,
+    is_g2p_en_available,
+    is_in_notebook,
+    is_librosa_available,
+    is_onnx_available,
+    is_pandas_available,
+    is_phonemizer_available,
+    is_protobuf_available,
+    is_psutil_available,
+    is_py3nvml_available,
+    is_pyctcdecode_available,
+    is_pytesseract_available,
+    is_pytorch_quantization_available,
+    is_rjieba_available,
+    is_sagemaker_dp_enabled,
+    is_sagemaker_mp_enabled,
+    is_scipy_available,
+    is_sentencepiece_available,
+    is_seqio_available,
+    is_sklearn_available,
+    is_soundfile_available,
+    is_spacy_available,
+    is_speech_available,
+    is_tensor,
+    is_timm_available,
+    is_tokenizers_available,
+    is_torch_available,
+    is_torch_cuda_available,
+    is_torch_fx_proxy,
+    is_torch_mps_available,
+    is_torch_tf32_available,
+    is_torch_xla_available,
+    is_torchaudio_available,
+    is_training_run_on_sagemaker,
+    is_vision_available,
+    replace_return_docstrings,
+    requires_backends,
+    to_numpy,
+    to_py_obj,
+    torch_only_method,
+)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/fusion_mapping.py ADDED Viewed

	@@ -0,0 +1,270 @@

+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fusion registration helpers.
+See `docs/source/en/fusion_mapping.md` for the design overview and extension guide.
+"""
+import math
+import re
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any
+import torch
+from torch import nn
+from .conversion_mapping import get_checkpoint_conversion_mapping, register_checkpoint_conversion_mapping
+from .core_model_loading import Conv3dToLinear, WeightConverter, WeightRenaming, WeightTransform
+from .monkey_patching import register_patch_mapping
+from .utils import logging
+if TYPE_CHECKING:
+    from .configuration_utils import PretrainedConfig
+    from .modeling_utils import PreTrainedModel
+logger = logging.get_logger(__name__)
+# Discovery results, keyed first by fusion name and then by model class. Each value maps the name of an
+# original module class to its fused replacement class (filled in by `_discover_fusable_modules`).
+_FUSION_DISCOVERY_CACHE: dict[str, dict[type, dict[str, type[nn.Module]]]] = {}
+class ModuleFusionSpec:
+    """Base recipe for a fusion family.
+    A fusion spec decides which modules are eligible for a fusion, how to build
+    the runtime replacement class, and which weight transforms are needed to map
+    checkpoints between the original and fused layouts.
+    """
+    target_modules_patterns: tuple[str, ...] = ()
+    def get_empty_log(self, model_name: str) -> str:
+        """Return the log message emitted when no compatible modules are found."""
+        return f"No compatible {type(self).__name__} classes found to fuse for {model_name}"
+    def is_fusable(self, module: nn.Module) -> bool:
+        """Return whether `module` is compatible with this fusion family."""
+        raise NotImplementedError
+    def make_fused_class(self, original_cls: type[nn.Module]) -> type[nn.Module]:
+        """Build the runtime replacement class for a compatible module class."""
+        raise NotImplementedError
+    def make_transforms(self, config: "PretrainedConfig") -> list[WeightTransform]:
+        """Build the weight transforms needed to load and save the fused runtime layout."""
+        raise NotImplementedError
+class _FusedPatchEmbeddingMixin:
+    def __init__(self, *args, **kwargs):
+        # call the original_cls.__init__()
+        super().__init__(*args, **kwargs)
+        self.patch_volume = self.proj.in_channels * math.prod(self.proj.kernel_size)
+        self.linear_proj = nn.Linear(
+            self.patch_volume,
+            self.proj.out_channels,
+            bias=self.proj.bias is not None,
+            device=self.proj.weight.device,
+            dtype=self.proj.weight.dtype,
+        )
+        del self.proj
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.linear_proj.weight.dtype
+        hidden_states = hidden_states.view(-1, self.patch_volume)
+        hidden_states = self.linear_proj(hidden_states.to(dtype=target_dtype))
+        return hidden_states.view(-1, self.embed_dim)
+class PatchEmbeddingsFusionSpec(ModuleFusionSpec):
+    """Fuse compatible Conv3d patch embeddings into flattened Linear projections."""
+    target_modules_patterns = (r"(^|\.)patch_embed$",)
+    def is_fusable(self, module: nn.Module) -> bool:
+        if not isinstance(proj := getattr(module, "proj", None), nn.Conv3d):
+            return False
+        # no overlap between the patches
+        return (
+            proj.stride == proj.kernel_size
+            and proj.padding == (0, 0, 0)
+            and proj.dilation == (1, 1, 1)
+            and proj.groups == 1
+        )
+    def make_fused_class(self, original_cls: type[nn.Module]) -> type[nn.Module]:
+        fused_cls = type(f"Fused{original_cls.__name__}", (_FusedPatchEmbeddingMixin, original_cls), {})
+        fused_cls.__qualname__ = f"Fused{original_cls.__qualname__}"
+        return fused_cls
+    def make_transforms(self, config: "PretrainedConfig") -> list[WeightTransform]:
+        vision_config = getattr(config, "vision_config", config)
+        patch_size = vision_config.patch_size
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+        kernel_size = (vision_config.temporal_patch_size, *tuple(patch_size))
+        in_channels = vision_config.in_channels
+        return [
+            WeightConverter(
+                source_patterns=r"patch_embed\.proj\.weight$",
+                target_patterns=r"patch_embed\.linear_proj\.weight$",
+                operations=[
+                    Conv3dToLinear(
+                        in_channels=in_channels,
+                        kernel_size=kernel_size,
+                    )
+                ],
+            ),
+            WeightRenaming(
+                source_patterns=r"patch_embed\.proj\.bias$",
+                target_patterns=r"patch_embed\.linear_proj\.bias$",
+            ),
+        ]
+def _discover_fusable_modules(
+    cls: "type[PreTrainedModel]",
+    config: "PretrainedConfig",
+    fusion_name: str,
+    spec: ModuleFusionSpec,
+) -> dict[str, type[nn.Module]]:
+    """Discover compatible module classes for one fusion family on a meta-initialized model.
+    This function:
+    - instantiates `cls(config)` on the meta device
+    - scans `named_modules()` for candidate modules
+    - optionally pre-filters them with `target_modules_patterns`
+    - uses `is_fusable(...)` as the final structural check
+    - builds the class-level patch mapping used by monkey patching
+    Results are cached per `(fusion_name, cls)` to avoid repeated meta-initialization.
+    This matches the current class-level fusion behavior, where one compatible
+    module class maps to one fused replacement class.
+    """
+    cache = _FUSION_DISCOVERY_CACHE.setdefault(fusion_name, {})
+    if cls in cache:
+        return cache[cls]
+    with torch.device("meta"):
+        model = cls(config)
+    seen_classes = set()
+    patch_mapping = {}
+    target_module_pattern = (
+        re.compile("|".join(spec.target_modules_patterns)) if spec.target_modules_patterns else None
+    )
+    for module_name, module in model.named_modules():
+        module_cls = type(module)
+        if module_cls in seen_classes:
+            continue
+        if target_module_pattern is not None and target_module_pattern.search(module_name) is None:
+            continue
+        if not spec.is_fusable(module):
+            continue
+        seen_classes.add(module_cls)
+        patch_mapping[module_cls.__name__] = spec.make_fused_class(module_cls)
+    cache[cls] = patch_mapping
+    return patch_mapping
+def _register_module_fusion(
+    cls: "type[PreTrainedModel]", config: "PretrainedConfig", fusion_name: str, spec: ModuleFusionSpec
+) -> None:
+    """Register one fusion family for `cls`.
+    This function updates the two global registries used by fused loading:
+    - the monkey-patching registry, so compatible module classes are replaced before initialization
+    - the checkpoint conversion mapping, so fused runtime modules still load from the original checkpoint layout
+    Notes:
+    - conflicting checkpoint transforms fail fast
+    """
+    fusable_classes = _discover_fusable_modules(cls, config, fusion_name=fusion_name, spec=spec)
+    if not fusable_classes:
+        logger.info(spec.get_empty_log(cls.__name__))
+        return
+    register_patch_mapping(fusable_classes, overwrite=True)
+    if not hasattr(cls, "config_class") or not hasattr(cls.config_class, "model_type"):
+        raise ValueError(f"Model {cls.__name__} has no config class or model type")
+    model_type = cls.config_class.model_type
+    converters = spec.make_transforms(config)
+    existing_converters = get_checkpoint_conversion_mapping(model_type)
+    if existing_converters is not None:
+        # WeightConverter matching stops at the first matching source pattern, so
+        # conflicting converters must fail fast instead of being appended.
+        existing_converter_sources = {tuple(existing.source_patterns): existing for existing in existing_converters}
+        for converter in converters:
+            source_patterns = tuple(converter.source_patterns)
+            existing_converter = existing_converter_sources.get(source_patterns)
+            if existing_converter is not None:
+                raise ValueError(
+                    f"Fusion {fusion_name} for model type {model_type} conflicts with an existing conversion mapping "
+                    f"for source patterns {source_patterns}."
+                )
+        # TODO: allow compatible fusions mentioned https://github.com/huggingface/transformers/pull/45041#discussion_r3028989716
+        converters = existing_converters + converters
+    register_checkpoint_conversion_mapping(model_type, converters, overwrite=True)
+# Fusion families that can be requested through `fusion_config`, keyed by the name used in that config.
+_FUSION_REGISTRY: dict[str, ModuleFusionSpec] = {"patch_embeddings": PatchEmbeddingsFusionSpec()}
+def _iter_enabled_fusions(fusion_config: Mapping[str, bool | Mapping[str, Any]]) -> list[str]:
+    """Validate `fusion_config` and return enabled fusion names in user-specified order."""
+    enabled_fusions = []
+    for fusion_name, fusion_options in fusion_config.items():
+        if fusion_name not in _FUSION_REGISTRY:
+            raise ValueError(f"Unknown fusion type: {fusion_name}")
+        if fusion_options is False:
+            continue
+        if fusion_options is not True and not isinstance(fusion_options, Mapping):
+            raise ValueError(
+                f"Invalid fusion config for {fusion_name}: expected `True`, `False`, or a mapping of options."
+            )
+        enabled_fusions.append(fusion_name)
+    return enabled_fusions
+def register_fusion_patches(
+    cls: "type[PreTrainedModel]", config, fusion_config: Mapping[str, bool | Mapping[str, Any]] | None = None
+) -> None:
+    """Register requested runtime fusions for `cls`.
+    This function:
+    - validates `fusion_config` against `_FUSION_REGISTRY`
+    - resolves the enabled fusion families in user order
+    - registers monkey patches and checkpoint transforms before model instantiation
+    """
+    if not fusion_config:
+        return
+    for fusion_name in _iter_enabled_fusions(fusion_config):
+        _register_module_fusion(cls, config, fusion_name, _FUSION_REGISTRY[fusion_name])

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_processing_backends.py ADDED Viewed

	@@ -0,0 +1,689 @@

+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Iterable
+from functools import lru_cache
+from typing import Any, Optional, Union
+import numpy as np
+from .image_processing_base import BatchFeature
+from .image_processing_utils import BaseImageProcessor
+from .image_transforms import (
+    center_crop as np_center_crop,
+)
+from .image_transforms import (
+    convert_to_rgb,
+    divide_to_patches,  # noqa: F401 - re-exported for backward compat with image_processing_utils_fast
+    get_resize_output_image_size,
+    get_size_with_aspect_ratio,
+    group_images_by_shape,
+    reorder_images,
+)
+from .image_transforms import (
+    normalize as np_normalize,
+)
+from .image_transforms import (
+    rescale as np_rescale,
+)
+from .image_transforms import (
+    resize as np_resize,
+)
+from .image_utils import (
+    ChannelDimension,
+    ImageInput,
+    ImageType,
+    SizeDict,
+    get_image_size,
+    get_image_size_for_max_height_width,
+    get_image_type,
+    get_max_height_width,
+    infer_channel_dimension_format,
+    is_valid_image,
+    load_image_as_tensor,
+)
+from .processing_utils import ImagesKwargs, Unpack
+from .utils import (
+    TensorType,
+    is_torch_available,
+    is_torchvision_available,
+    is_vision_available,
+    logging,
+)
+from .utils.import_utils import is_rocm_platform, is_torchdynamo_compiling, requires
+if is_vision_available():
+    from .image_utils import PILImageResampling
+if is_torch_available():
+    import torch
+if is_torchvision_available():
+    from torchvision.transforms.v2 import functional as tvF
+    from .image_utils import pil_torch_interpolation_mapping, torch_pil_interpolation_mapping
+else:
+    pil_torch_interpolation_mapping = None
+    torch_pil_interpolation_mapping = None
+logger = logging.get_logger(__name__)
+@requires(backends=("torch", "torchvision"))
+class TorchvisionBackend(BaseImageProcessor):
+    """Torchvision backend for GPU-accelerated batched image processing."""
+    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
+        """Run the parent initializer with `kwargs`, then pass the same `kwargs` to `_set_attributes`."""
+        super().__init__(**kwargs)
+        self._set_attributes(**kwargs)
+    @property
+    def is_fast(self) -> bool:
+        """
+        `bool`: Whether or not this image processor is using the fast (Torchvision) backend.
+        Always `True` for this class; reading the property also logs a deprecation warning.
+        The `is_fast` property is deprecated and will be removed in v5.3 of Transformers.
+        Use the `backend` attribute instead (e.g., `processor.backend == "torchvision"`).
+        """
+        logger.warning_once(
+            "The `is_fast` property is deprecated and will be removed in v5.3 of Transformers. "
+            "Use the `backend` attribute instead (e.g., `processor.backend == 'torchvision'`)."
+        )
+        return True
+    @property
+    def backend(self) -> str:
+        """
+        `str`: The backend used by this image processor, always `"torchvision"` for this class.
+        """
+        return "torchvision"
+    def fetch_images(self, image_url_or_urls: str | list[str] | list[list[str]]):
+        """
+        Convert a single or a list of URLs / paths into `torch.Tensor` objects.
+        Already-valid image objects (tensors, numpy arrays, PIL Images) are passed through
+        unchanged so that callers who pre-load images are unaffected.
+        """
+        if isinstance(image_url_or_urls, (list, tuple)):
+            return [self.fetch_images(x) for x in image_url_or_urls]
+        elif isinstance(image_url_or_urls, str):
+            return load_image_as_tensor(image_url_or_urls)
+        elif is_valid_image(image_url_or_urls):
+            return image_url_or_urls
+        else:
+            raise TypeError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
+    def process_image(
+        self,
+        image: ImageInput,
+        do_convert_rgb: bool | None = None,
+        input_data_format: str | ChannelDimension | None = None,
+        device: Optional["torch.device"] = None,
+        **kwargs: Unpack[ImagesKwargs],
+    ) -> "torch.Tensor":
+        """
+        Process a single image for torchvision backend.
+        Args:
+            image: A PIL image, torch tensor or numpy array.
+            do_convert_rgb: When truthy, `self.convert_to_rgb` is applied before any other conversion.
+            input_data_format: Channel layout of `image`; inferred with `infer_channel_dimension_format`
+                when `None`.
+            device: When not `None`, the resulting tensor is moved to this device.
+            **kwargs: Not used by this method.
+        Returns:
+            `torch.Tensor`: The image as a tensor, permuted to channels-first when the input was channels-last.
+        Raises:
+            ValueError: If the image type is not PIL, torch or numpy.
+        """
+        # The type is determined once, before the optional RGB conversion
+        image_type = get_image_type(image)
+        if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
+            raise ValueError(f"Unsupported input image type {image_type}")
+        if do_convert_rgb:
+            image = self.convert_to_rgb(image)
+        # Bring PIL images and numpy arrays to torch tensors; torch inputs are used as they are
+        if image_type == ImageType.PIL:
+            image = tvF.pil_to_tensor(image)
+        elif image_type == ImageType.NUMPY:
+            image = torch.from_numpy(image).contiguous()
+        # A 2-D image gets a leading axis of size 1
+        if image.ndim == 2:
+            image = image.unsqueeze(0)
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+        # Move the last axis to the front so the result is channels-first
+        if input_data_format == ChannelDimension.LAST:
+            image = image.permute(2, 0, 1).contiguous()
+        if device is not None:
+            image = image.to(device)
+        return image
+    def convert_to_rgb(self, image: ImageInput) -> ImageInput:
+        """Convert an image to RGB format by delegating to the module-level `convert_to_rgb` helper."""
+        return convert_to_rgb(image)
+    def pad(
+        self,
+        images: list["torch.Tensor"],
+        pad_size: SizeDict | None = None,
+        fill_value: int | None = 0,
+        padding_mode: str | None = "constant",
+        return_mask: bool = False,
+        disable_grouping: bool | None = False,
+        is_nested: bool | None = False,
+        **kwargs,
+    ) -> Union[tuple["torch.Tensor", "torch.Tensor"], "torch.Tensor"]:
+        """
+        Pad images using Torchvision with batched operations.
+        Args:
+            images: The images to pad.
+            pad_size: Target size. When `None`, the result of `get_max_height_width(images)` is used.
+            fill_value: Passed to `tvF.pad` as `fill`.
+            padding_mode: Passed to `tvF.pad` as `padding_mode`.
+            return_mask: When `True`, also return int64 masks that are 1 on the original image area and 0 on
+                the padded area.
+            disable_grouping: Forwarded to `group_images_by_shape`.
+            is_nested: Forwarded to `group_images_by_shape` and `reorder_images`.
+            **kwargs: Not used by this method.
+        Returns:
+            The padded images, or a `(padded images, masks)` tuple when `return_mask` is `True`.
+        Raises:
+            ValueError: If `pad_size` has a falsy `height` or `width`, or if an image is larger than `pad_size`.
+        """
+        if pad_size is not None:
+            if not (pad_size.height and pad_size.width):
+                raise ValueError(f"Pad size must contain 'height' and 'width' keys only. Got pad_size={pad_size}.")
+            pad_size = (pad_size.height, pad_size.width)
+        else:
+            pad_size = get_max_height_width(images)
+        # Images of the same shape are stacked so that each group is padded with a single call
+        grouped_images, grouped_images_index = group_images_by_shape(
+            images, disable_grouping=disable_grouping, is_nested=is_nested
+        )
+        processed_images_grouped = {}
+        processed_masks_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            image_size = stacked_images.shape[-2:]
+            padding_height = pad_size[0] - image_size[0]
+            padding_width = pad_size[1] - image_size[1]
+            if padding_height < 0 or padding_width < 0:
+                raise ValueError(
+                    f"Padding dimensions are negative. Please make sure that the `pad_size` is larger than the "
+                    f"image size. Got pad_size={pad_size}, image_size={image_size}."
+                )
+            if image_size != pad_size:
+                # torchvision's 4-value padding is (left, top, right, bottom): pad on the right and bottom only
+                padding = (0, 0, padding_width, padding_height)
+                stacked_images = tvF.pad(stacked_images, padding, fill=fill_value, padding_mode=padding_mode)
+            processed_images_grouped[shape] = stacked_images
+            if return_mask:
+                # Zero mask with the padded spatial size; indexing `0` on the third-to-last axis drops that axis
+                stacked_masks = torch.zeros_like(stacked_images, dtype=torch.int64)[..., 0, :, :]
+                # Mark the area covered by the original (unpadded) image
+                stacked_masks[..., : image_size[0], : image_size[1]] = 1
+                processed_masks_grouped[shape] = stacked_masks
+        # Restore the original ordering of the images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=is_nested)
+        if return_mask:
+            processed_masks = reorder_images(processed_masks_grouped, grouped_images_index, is_nested=is_nested)
+            return processed_images, processed_masks
+        return processed_images
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Resize an image using Torchvision.
+        Args:
+            image: The image to resize; its last two dimensions are used as the current size.
+            size: Target size description. The first matching combination wins, in this order:
+                `shortest_edge` + `longest_edge`, `shortest_edge` alone, `max_height` + `max_width`,
+                `height` + `width`.
+            resample: PIL resampling value / int (mapped through `pil_torch_interpolation_mapping`) or a
+                torchvision interpolation mode. Defaults to bilinear when `None`. LANCZOS is replaced by
+                BICUBIC with a warning.
+            antialias: Forwarded to `tvF.resize`.
+            **kwargs: Not used by this method.
+        Returns:
+            `torch.Tensor`: The resized image.
+        Raises:
+            ValueError: If `size` matches none of the supported combinations.
+        """
+        # Convert PIL resample to torchvision interpolation if needed
+        if resample is not None:
+            if isinstance(resample, (PILImageResampling, int)):
+                interpolation = pil_torch_interpolation_mapping[resample]
+            else:
+                interpolation = resample
+        else:
+            interpolation = tvF.InterpolationMode.BILINEAR
+        if interpolation == tvF.InterpolationMode.LANCZOS:
+            logger.warning_once(
+                "You have used a torchvision backend image processor with LANCZOS resample which not yet supported for torch.Tensor. "
+                "BICUBIC resample will be used as an alternative. Please fall back to a pil backend image processor if you "
+                "want full consistency with the original model."
+            )
+            interpolation = tvF.InterpolationMode.BICUBIC
+        # Work out the output size; the order of these branches defines which keys take precedence
+        if size.shortest_edge and size.longest_edge:
+            new_size = get_size_with_aspect_ratio(
+                image.size()[-2:],
+                size.shortest_edge,
+                size.longest_edge,
+            )
+        elif size.shortest_edge:
+            new_size = get_resize_output_image_size(
+                image,
+                size=size.shortest_edge,
+                default_to_square=False,
+                input_data_format=ChannelDimension.FIRST,
+            )
+        elif size.max_height and size.max_width:
+            new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width)
+        elif size.height and size.width:
+            new_size = (size.height, size.width)
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got"
+                f" {size}."
+            )
+        # Workaround for torch.compile issue with uint8 on AMD GPUs
+        if is_torchdynamo_compiling() and is_rocm_platform():
+            return self._compile_friendly_resize(image, new_size, interpolation, antialias)
+        return tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+    @staticmethod
+    def _compile_friendly_resize(
+        image: "torch.Tensor",
+        new_size: tuple[int, int],
+        interpolation: Optional["tvF.InterpolationMode"] = None,
+        antialias: bool = True,
+    ) -> "torch.Tensor":
+        """
+        A wrapper around tvF.resize for torch.compile compatibility with uint8 tensors.
+        uint8 images are resized in floating point and converted back to uint8 afterwards; every other
+        dtype goes straight to `tvF.resize`.
+        """
+        if image.dtype == torch.uint8:
+            # Resize in float: scale down by 256, resize, then scale back up
+            image = image.float() / 256
+            image = tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+            image = image * 256
+            # Clamp to the valid uint8 range before rounding and casting back
+            image = torch.where(image > 255, 255, image)
+            image = torch.where(image < 0, 0, image)
+            image = image.round().to(torch.uint8)
+        else:
+            image = tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+        return image
+    def rescale(
+        self,
+        image: "torch.Tensor",
+        scale: float,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Rescale an image by a scale factor.
+        Implemented as a plain element-wise multiplication (`image * scale`); no Torchvision function is called.
+        Args:
+            image (`torch.Tensor`):
+                Image to rescale.
+            scale (`float`):
+                Factor every value is multiplied by.
+            **kwargs:
+                Accepted but not used by this implementation.
+        Returns:
+            `torch.Tensor`: The product `image * scale`.
+        """
+        return image * scale
+    def normalize(
+        self,
+        image: "torch.Tensor",
+        mean: float | Iterable[float],
+        std: float | Iterable[float],
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Normalize an image using Torchvision.
+        Delegates directly to `tvF.normalize(image, mean, std)`.
+        Args:
+            image (`torch.Tensor`):
+                Image to normalize.
+            mean (`float` or `Iterable[float]`):
+                Mean forwarded to `tvF.normalize`.
+            std (`float` or `Iterable[float]`):
+                Standard deviation forwarded to `tvF.normalize`.
+            **kwargs:
+                Accepted but not used by this implementation.
+        Returns:
+            `torch.Tensor`: The value returned by `tvF.normalize`.
+        """
+        return tvF.normalize(image, mean, std)
+    @lru_cache(maxsize=10)
+    def _fuse_mean_std_and_rescale_factor(
+        self,
+        do_normalize: bool | None = None,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_rescale: bool | None = None,
+        rescale_factor: float | None = None,
+        device: Optional["torch.device"] = None,
+    ) -> tuple:
+        if do_rescale and do_normalize:
+            # Fused rescale and normalize
+            image_mean = torch.tensor(image_mean, device=device) * (1.0 / rescale_factor)
+            image_std = torch.tensor(image_std, device=device) * (1.0 / rescale_factor)
+            do_rescale = False
+        return image_mean, image_std, do_rescale
+    def rescale_and_normalize(
+        self,
+        images: "torch.Tensor",
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: float | list[float],
+        image_std: float | list[float],
+    ) -> "torch.Tensor":
+        """Rescale and normalize images using Torchvision (fused for efficiency)."""
+        image_mean, image_std, do_rescale = self._fuse_mean_std_and_rescale_factor(
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            device=images.device,
+        )
+        if do_normalize:
+            images = self.normalize(images.to(dtype=torch.float32), image_mean, image_std)
+        elif do_rescale:
+            images = self.rescale(images, rescale_factor)
+        return images
+    def center_crop(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """Center crop an image using Torchvision."""
+        if size.height is None or size.width is None:
+            raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
+        image_height, image_width = image.shape[-2:]
+        crop_height, crop_width = size.height, size.width
+        if crop_width > image_width or crop_height > image_height:
+            padding_ltrb = [
+                (crop_width - image_width) // 2 if crop_width > image_width else 0,
+                (crop_height - image_height) // 2 if crop_height > image_height else 0,
+                (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
+                (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
+            ]
+            image = tvF.pad(image, padding_ltrb, fill=0)
+            image_height, image_width = image.shape[-2:]
+            if crop_width == image_width and crop_height == image_height:
+                return image
+        crop_top = int((image_height - crop_height) / 2.0)
+        crop_left = int((image_width - crop_width) / 2.0)
+        return tvF.crop(image, crop_top, crop_left, crop_height, crop_width)
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        do_pad: bool | None,
+        pad_size: SizeDict | None,
+        disable_grouping: bool | None,
+        return_tensors: str | TensorType | None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Preprocess using Torchvision backend (fast, GPU-accelerated)."""
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, resample=resample)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+        # Group images by size for further processing
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        if do_pad:
+            processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+@requires(backends=("vision",))
+class PilBackend(BaseImageProcessor):
+    """
+    PIL/NumPy backend for portable CPU-only image processing.
+    Images are handled as `np.ndarray`s: `process_image` transposes channels-last inputs to channels-first, and
+    every operation below passes `ChannelDimension.FIRST` to the NumPy helper it delegates to.
+    """
+    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
+        """Run the parent initializer, then resolve instance attributes with `_set_attributes`."""
+        super().__init__(**kwargs)
+        self._set_attributes(**kwargs)
+    @property
+    def is_fast(self) -> bool:
+        """
+        `bool`: Whether or not this image processor is using the fast (Torchvision) backend.
+        The `is_fast` property is deprecated and will be removed in v5.3 of Transformers.
+        Use the `backend` attribute instead (e.g., `processor.backend == "torchvision"`).
+        """
+        logger.warning_once(
+            "The `is_fast` property is deprecated and will be removed in v5.3 of Transformers. "
+            "Use the `backend` attribute instead (e.g., `processor.backend == 'torchvision'`)."
+        )
+        # This backend always reports False.
+        return False
+    @property
+    def backend(self) -> str:
+        """
+        `str`: The backend used by this image processor.
+        """
+        return "pil"
+    def process_image(
+        self,
+        image: ImageInput,
+        do_convert_rgb: bool | None = None,
+        input_data_format: str | ChannelDimension | None = None,
+        **kwargs: Unpack[ImagesKwargs],
+    ) -> np.ndarray:
+        """
+        Process a single image for PIL backend.
+        Args:
+            image (`ImageInput`):
+                A PIL image, torch tensor or NumPy array.
+            do_convert_rgb (`bool`, *optional*):
+                If truthy, the image is first passed through `convert_to_rgb`.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                Channel layout of the input. When `None`, PIL images with three or more array dimensions are
+                treated as channels-last; for everything else the layout is inferred with
+                `infer_channel_dimension_format`.
+        Returns:
+            `np.ndarray`: The image as an array. 2D inputs get a leading axis and channels-last inputs are
+            transposed to channels-first.
+        Raises:
+            ValueError: If the image type is not PIL, torch or NumPy.
+        """
+        # The type is captured before any conversion below, so the later branches see the original input type.
+        image_type = get_image_type(image)
+        if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
+            raise ValueError(f"Unsupported input image type {image_type}")
+        if do_convert_rgb:
+            image = self.convert_to_rgb(image)
+        if image_type == ImageType.PIL:
+            image = np.array(image)
+            # Set LAST only for multi-channel PIL images (H, W, C); for grayscale (H, W), leave as is to avoid shape errors after expand_dims.
+            if image.ndim >= 3:
+                input_data_format = ChannelDimension.LAST if input_data_format is None else input_data_format
+        elif image_type == ImageType.TORCH:
+            image = image.numpy()
+        # Give 2D arrays a leading axis so they become 3D.
+        if image.ndim == 2:
+            image = np.expand_dims(image, axis=0)
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+        if input_data_format == ChannelDimension.LAST:
+            # Convert from channels-last to channels-first
+            if isinstance(image, np.ndarray):
+                image = np.transpose(image, (2, 0, 1))
+        return image
+    def convert_to_rgb(self, image: ImageInput) -> ImageInput:
+        """Convert an image to RGB format by delegating to the module-level `convert_to_rgb` helper."""
+        return convert_to_rgb(image)
+    def pad(
+        self,
+        images: list[np.ndarray],
+        pad_size: SizeDict = None,
+        fill_value: int | None = 0,
+        padding_mode: str | None = "constant",
+        return_mask: bool = False,
+        **kwargs,
+    ) -> tuple[list[np.ndarray], list[np.ndarray]] | list[np.ndarray]:
+        """
+        Pad images to specified size using NumPy.
+        Padding is only added after the existing rows and columns (the "after" side of the last two axes), so
+        the original content keeps its position at the start of each axis.
+        Args:
+            images (`list[np.ndarray]`):
+                Images to pad; their sizes are read with `channel_dim=ChannelDimension.FIRST`.
+            pad_size (`SizeDict`, *optional*):
+                Target size; both `height` and `width` must be set. When `None`, the target is taken from
+                `get_max_height_width(images)`.
+            fill_value (`int`, *optional*, defaults to 0):
+                Constant used for the padded area when `padding_mode` is `"constant"`.
+            padding_mode (`str`, *optional*, defaults to `"constant"`):
+                Mode forwarded to `np.pad`.
+            return_mask (`bool`, *optional*, defaults to `False`):
+                Whether to also build, for each image, an `int64` mask of the target size holding 1 over the
+                original image area and 0 elsewhere.
+        Returns:
+            The list of padded images, or a `(images, masks)` tuple when `return_mask` is `True`.
+        Raises:
+            ValueError: If `pad_size` is given without both `height` and `width`, or if an image is larger
+                than the target size in either dimension.
+        """
+        if pad_size is not None:
+            if not (pad_size.height and pad_size.width):
+                raise ValueError(f"Pad size must contain 'height' and 'width' keys only. Got pad_size={pad_size}.")
+            target_height, target_width = pad_size.height, pad_size.width
+        else:
+            target_height, target_width = get_max_height_width(images)
+        processed_images = []
+        processed_masks = []
+        for image in images:
+            height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
+            padding_height = target_height - height
+            padding_width = target_width - width
+            if padding_height < 0 or padding_width < 0:
+                raise ValueError(
+                    f"Padding dimensions are negative. Please make sure that the `pad_size` is larger than the "
+                    f"image size. Got pad_size=({target_height}, {target_width}), image_size=({height}, {width})."
+                )
+            # Images already at the target size are appended unchanged.
+            if height != target_height or width != target_width:
+                # Pad format: ((before_1, after_1), (before_2, after_2), ...)
+                # For CHW format: ((0, 0), (0, padding_height), (0, padding_width))
+                pad_width = ((0, 0), (0, padding_height), (0, padding_width))
+                if padding_mode == "constant":
+                    image = np.pad(image, pad_width, mode="constant", constant_values=fill_value)
+                else:
+                    image = np.pad(image, pad_width, mode=padding_mode)
+            processed_images.append(image)
+            if return_mask:
+                # 1 marks the original image area, 0 marks padding.
+                mask = np.zeros((target_height, target_width), dtype=np.int64)
+                mask[:height, :width] = 1
+                processed_masks.append(mask)
+        if return_mask:
+            return processed_images, processed_masks
+        return processed_images
+    def resize(
+        self,
+        image: np.ndarray,
+        size: SizeDict,
+        resample: "PILImageResampling | None" = None,
+        reducing_gap: int | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image using PIL/NumPy.
+        The output size is resolved from `size` in this order: `shortest_edge` together with `longest_edge`,
+        `shortest_edge` alone, `max_height` together with `max_width`, then `height` together with `width`.
+        Args:
+            image (`np.ndarray`):
+                Image to resize, passed to the helpers as channels-first.
+            size (`SizeDict`):
+                Size specification, resolved as described above.
+            resample (`PILImageResampling`, *optional*):
+                Resampling filter. Values that are neither `PILImageResampling` nor `int` are translated through
+                `torch_pil_interpolation_mapping` when possible; otherwise, and when `None`,
+                `PILImageResampling.BILINEAR` is used.
+            reducing_gap (`int`, *optional*):
+                Forwarded to `np_resize`.
+        Returns:
+            `np.ndarray`: The value returned by `np_resize`.
+        Raises:
+            ValueError: If `size` matches none of the supported key combinations.
+        """
+        # PIL backend only supports PILImageResampling
+        if resample is not None and not isinstance(resample, (PILImageResampling, int)):
+            if torch_pil_interpolation_mapping is not None and resample in torch_pil_interpolation_mapping:
+                resample = torch_pil_interpolation_mapping[resample]
+            else:
+                resample = PILImageResampling.BILINEAR
+        resample = resample if resample is not None else PILImageResampling.BILINEAR
+        if size.shortest_edge and size.longest_edge:
+            height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
+            new_size = get_size_with_aspect_ratio(
+                (height, width),
+                size.shortest_edge,
+                size.longest_edge,
+            )
+        elif size.shortest_edge:
+            new_size = get_resize_output_image_size(
+                image,
+                size=size.shortest_edge,
+                default_to_square=False,
+                input_data_format=ChannelDimension.FIRST,
+            )
+        elif size.max_height and size.max_width:
+            height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
+            new_size = get_image_size_for_max_height_width((height, width), size.max_height, size.max_width)
+        elif size.height and size.width:
+            new_size = (size.height, size.width)
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got"
+                f" {size}."
+            )
+        return np_resize(
+            image,
+            size=new_size,
+            resample=resample,
+            reducing_gap=reducing_gap,
+            data_format=ChannelDimension.FIRST,
+            input_data_format=ChannelDimension.FIRST,
+        )
+    def rescale(
+        self,
+        image: np.ndarray,
+        scale: float,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Rescale an image by a scale factor using NumPy.
+        Delegates to `np_rescale` with channels-first input and output formats; extra `kwargs` are not used.
+        """
+        return np_rescale(
+            image,
+            scale=scale,
+            data_format=ChannelDimension.FIRST,
+            input_data_format=ChannelDimension.FIRST,
+        )
+    def normalize(
+        self,
+        image: np.ndarray,
+        mean: float | Iterable[float],
+        std: float | Iterable[float],
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Normalize an image using NumPy.
+        Delegates to `np_normalize` with channels-first input and output formats; extra `kwargs` are not used.
+        """
+        return np_normalize(
+            image,
+            mean=mean,
+            std=std,
+            data_format=ChannelDimension.FIRST,
+            input_data_format=ChannelDimension.FIRST,
+        )
+    def center_crop(
+        self,
+        image: np.ndarray,
+        size: SizeDict,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Center crop an image using NumPy.
+        Delegates to `np_center_crop` with `(size.height, size.width)` and channels-first formats.
+        Raises:
+            ValueError: If `size.height` or `size.width` is `None`.
+        """
+        if size.height is None or size.width is None:
+            raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
+        return np_center_crop(
+            image,
+            size=(size.height, size.width),
+            data_format=ChannelDimension.FIRST,
+            input_data_format=ChannelDimension.FIRST,
+        )
+    def _preprocess(
+        self,
+        images: list[np.ndarray],
+        do_resize: bool,
+        size: SizeDict,
+        resample: "PILImageResampling | None",
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        do_pad: bool | None,
+        pad_size: SizeDict | None,
+        return_tensors: str | TensorType | None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess using PIL backend (portable, CPU-only).
+        Each image is processed on its own, in this order: resize, center crop, rescale, normalize (each step
+        only when its flag is truthy). Padding, when enabled, is applied afterwards to the whole list.
+        Returns:
+            `BatchFeature`: A batch with the processed images under `"pixel_values"`.
+        """
+        processed_images = []
+        for image in images:
+            if do_resize:
+                image = self.resize(image=image, size=size, resample=resample)
+            if do_center_crop:
+                image = self.center_crop(image, crop_size)
+            if do_rescale:
+                image = self.rescale(image, rescale_factor)
+            if do_normalize:
+                image = self.normalize(image, image_mean, image_std)
+            processed_images.append(image)
+        if do_pad:
+            processed_images = self.pad(processed_images, pad_size=pad_size)
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize with the parent `to_dict`, dropping a trailing "Pil" from `image_processor_type`."""
+        processor_dict = super().to_dict()
+        # Remove the "Pil" suffix from the image processor type
+        if processor_dict.get("image_processor_type", "").endswith("Pil"):
+            processor_dict["image_processor_type"] = processor_dict["image_processor_type"][:-3]
+        return processor_dict
+# Backward-compatible alias: allow referring to TorchvisionBackend as BaseImageProcessorFast
+# (both names are bound to the same class object, not a subclass).
+BaseImageProcessorFast = TorchvisionBackend

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_processing_utils.py ADDED Viewed

	@@ -0,0 +1,688 @@

+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from collections.abc import Iterable
+from copy import deepcopy
+from functools import partial
+from typing import Any
+import numpy as np
+from huggingface_hub.dataclasses import validate_typed_dict
+from .image_processing_base import BatchFeature, ImageProcessingMixin
+from .image_transforms import center_crop, normalize, rescale
+from .image_utils import (
+    ChannelDimension,
+    ImageInput,
+    SizeDict,
+    get_image_size,
+    make_flat_list_of_images,
+    validate_preprocess_arguments,
+)
+from .processing_utils import ImagesKwargs, Unpack
+from .utils import (
+    auto_docstring,
+    is_torchvision_available,
+    is_vision_available,
+    logging,
+)
+if is_vision_available():
+    from .image_utils import PILImageResampling
+if is_torchvision_available():
+    from torchvision.transforms.v2 import functional as tvF
+logger = logging.get_logger(__name__)
+# NOTE(review): this list is not referenced in the visible part of this module; how these two keys are
+# consumed (presumably special handling of init-time kwargs) should be confirmed against the callers.
+INIT_SERVICE_KWARGS = [
+    "processor_class",
+    "image_processor_type",
+]
+class BaseImageProcessor(ImageProcessingMixin):
+    r"""
+    Base class for image processors with an inheritance-based backend architecture.
+    This class defines the preprocessing pipeline: kwargs validation, input preparation, and dispatching to the
+    backend's `_preprocess` method. Backend subclasses (`TorchvisionBackend`, `PilBackend`) inherit from this class
+    and implement the actual image operations (resize, crop, rescale, normalize, etc.). Model-specific image
+    processors then inherit from the appropriate backend class.
+    Architecture Overview
+    ---------------------
+    The class hierarchy is:
+        BaseImageProcessor (this class)
+        ├── TorchvisionBackend    (GPU-accelerated, torch.Tensor)
+        │   └── ModelImageProcessor (e.g. LlavaNextImageProcessor)
+        └── PilBackend            (portable CPU, np.ndarray)
+            └── ModelImageProcessorPil (e.g. CLIPImageProcessorPil)
+    The preprocessing flow is:
+        __call__() → preprocess() → _preprocess_image_like_inputs() → _prepare_image_like_inputs()
+                                                                       (calls process_image per image)
+                                                                     → _preprocess()
+                                                                       (batch operations: resize, crop, etc.)
+    - `process_image`: Implemented by backends. Converts a single raw input (PIL, NumPy, or Tensor) to the
+      backend's working format (torch.Tensor or np.ndarray), handles RGB conversion and channel reordering.
+    - `_preprocess`: Implemented by backends. Performs the actual batch processing (resize, center crop, rescale,
+      normalize, pad) and returns a `BatchFeature`.
+    Basic Implementation
+    --------------------
+    For processors that only need standard operations (resize, center crop, rescale, normalize), inherit from
+    a backend and define class attributes:
+        from transformers.image_processing_backends import PilBackend
+        class MyImageProcessorPil(PilBackend):
+            resample = PILImageResampling.BILINEAR
+            image_mean = IMAGENET_DEFAULT_MEAN
+            image_std = IMAGENET_DEFAULT_STD
+            size = {"height": 224, "width": 224}
+            do_resize = True
+            do_rescale = True
+            do_normalize = True
+    The backend's `_preprocess` method handles the standard pipeline automatically.
+    Custom Processing
+    -----------------
+    For processors that need custom logic (e.g., patch-based processing, multiple input types), override
+    `_preprocess` in your model-specific processor. The `_preprocess` method receives already-prepared images
+    (converted to the backend format with channels-first ordering) and performs the actual processing:
+        class MyImageProcessor(TorchvisionBackend):
+            def _preprocess(self, images, do_resize, size, do_normalize, image_mean, image_std, **kwargs):
+                # Group images by shape for efficient batched operations
+                grouped_images, grouped_images_index = group_images_by_shape(images)
+                processed_groups = {}
+                for shape, stacked_images in grouped_images.items():
+                    if do_resize:
+                        stacked_images = self.resize(stacked_images, size=size)
+                    if do_normalize:
+                        stacked_images = self.normalize(stacked_images, mean=image_mean, std=image_std)
+                    processed_groups[shape] = stacked_images
+                processed_images = reorder_images(processed_groups, grouped_images_index)
+                return BatchFeature(data={"pixel_values": processed_images})
+    For processors handling multiple input types (e.g., images + segmentation maps), override
+    `_preprocess_image_like_inputs`:
+        def _preprocess_image_like_inputs(
+            self,
+            images: ImageInput,
+            segmentation_maps: ImageInput | None = None,
+            **kwargs,
+        ) -> BatchFeature:
+            images = self._prepare_image_like_inputs(images, **kwargs)
+            batch_feature = self._preprocess(images, **kwargs)
+            if segmentation_maps is not None:
+                maps = self._prepare_image_like_inputs(segmentation_maps, **kwargs)
+                batch_feature["labels"] = self._preprocess(maps, **kwargs).pixel_values
+            return batch_feature
+    Extending Backend Behavior
+    --------------------------
+    To customize operations for a specific backend, subclass the backend and override its methods:
+        from transformers.image_processing_backends import TorchvisionBackend, PilBackend
+        class MyTorchvisionProcessor(TorchvisionBackend):
+            def resize(self, image, size, **kwargs):
+                # Custom resize logic for torchvision
+                return super().resize(image, size, **kwargs)
+        class MyPilProcessor(PilBackend):
+            def resize(self, image, size, **kwargs):
+                # Custom resize logic for PIL
+                return super().resize(image, size, **kwargs)
+    Custom Parameters
+    -----------------
+    To add parameters beyond `ImagesKwargs`, create a custom kwargs class and set it as `valid_kwargs`:
+        class MyImageProcessorKwargs(ImagesKwargs):
+            custom_param: int | None = None
+        class MyImageProcessor(TorchvisionBackend):
+            valid_kwargs = MyImageProcessorKwargs
+            custom_param = 10  # default value
+    Key Notes
+    ---------
+    - Backend selection is done at the class level: inherit from `TorchvisionBackend` or `PilBackend`
+    - Backends receive images as `torch.Tensor` (Torchvision) or `np.ndarray` (PIL), always channels-first
+    - All images have channel dimension first during processing, regardless of backend
+    - Arguments not provided by users default to class attribute values
+    - Backend classes encapsulate backend-specific logic (resize, normalize, etc.) and can be overridden
+    """
+    valid_kwargs = ImagesKwargs
+    default_to_square = True
+    rescale_factor = 1 / 255
+    model_input_names = ["pixel_values"]
+    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
+        """Initialize by forwarding all keyword arguments to the parent class initializer."""
+        super().__init__(**kwargs)
+        # We don't call self._set_attributes in BaseImageProcessor for backward compatibility with remote code
+        # We call it instead in the backend subclasses' __init__ methods.
+    def _set_attributes(self, **kwargs):
+        """Resolve and set instance attributes from kwargs and class-level defaults for all valid kwargs."""
+        attributes = {}
+        for key in self.valid_kwargs.__annotations__:
+            kwarg = kwargs.pop(key, None)
+            if kwarg is not None:
+                attributes[key] = kwarg
+            else:
+                attributes[key] = deepcopy(getattr(self, key, None))
+        attributes = self._standardize_kwargs(**attributes)
+        for key, value in attributes.items():
+            setattr(self, key, value)
+        self._valid_kwargs_names = list(self.valid_kwargs.__annotations__.keys())
+    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
+        """Preprocess an image or a batch of images by forwarding all arguments to `preprocess`."""
+        return self.preprocess(images, *args, **kwargs)
+    def process_image(self, *args, **kwargs):
+        """
+        Process a single raw image into the backend's working format.
+        Implemented by backend subclasses (`TorchvisionBackend`, `PilBackend`). Converts a raw input
+        (PIL Image, NumPy array, or torch Tensor) to the backend's internal format (`torch.Tensor` for
+        Torchvision, `np.ndarray` for PIL), handles RGB conversion and ensures channels-first ordering.
+        Raises:
+            NotImplementedError: Always, in this base class; subclasses must override the method.
+        """
+        raise NotImplementedError
+    def _preprocess(self, *args, **kwargs):
+        """
+        Perform the actual batch image preprocessing (resize, center crop, rescale, normalize, pad).
+        Implemented by backend subclasses (`TorchvisionBackend`, `PilBackend`). Receives a list of
+        already-prepared images (in the backend's format, channels-first) and applies the configured
+        preprocessing operations. Returns a `BatchFeature` with the processed pixel values.
+        Model-specific processors can override this method to implement custom preprocessing logic
+        (e.g., patch-based processing in LLaVA-NeXT).
+        Raises:
+            NotImplementedError: Always, in this base class; subclasses must override the method.
+        """
+        raise NotImplementedError
+    def _prepare_images_structure(
+        self,
+        images: ImageInput,
+        expected_ndims: int = 3,
+    ) -> ImageInput:
+        """
+        Prepare the images structure for processing.
+        The input is first passed through `self.fetch_images`, then flattened with `make_flat_list_of_images`.
+        Args:
+            images (`ImageInput`):
+                The input images to process.
+            expected_ndims (`int`, *optional*, defaults to 3):
+                Forwarded to `make_flat_list_of_images` as the expected number of dimensions of one image.
+        Returns:
+            `ImageInput`: The images with a valid nesting.
+        """
+        images = self.fetch_images(images)
+        return make_flat_list_of_images(images, expected_ndims=expected_ndims)
+    def _prepare_image_like_inputs(
+        self,
+        images: ImageInput,
+        *args,
+        expected_ndims: int = 3,
+        **kwargs: Unpack[ImagesKwargs],
+    ) -> list[Any]:
+        """
+        Prepare image-like inputs for processing by converting each image via `process_image`.
+        Flattens the input structure and applies `process_image` (implemented by the backend) to each
+        individual image, converting raw inputs (PIL, NumPy, Tensor) into the backend's working format
+        with channels-first ordering.
+        Args:
+            images (`ImageInput`):
+                The image-like inputs to process.
+            expected_ndims (`int`, *optional*, defaults to 3):
+                The expected number of dimensions for the images.
+        Returns:
+            `list[torch.Tensor]` or `list[np.ndarray]`: The prepared images in the backend's format,
+            with channels-first ordering.
+        """
+        images = self._prepare_images_structure(images, expected_ndims=expected_ndims)
+        process_image_partial = partial(self.process_image, *args, **kwargs)
+        has_nested_structure = len(images) > 0 and isinstance(images[0], list | tuple)
+        if has_nested_structure:
+            processed_images = [[process_image_partial(img) for img in nested_list] for nested_list in images]
+        else:
+            processed_images = [process_image_partial(img) for img in images]
+        return processed_images
+    def _preprocess_image_like_inputs(
+        self,
+        images: ImageInput,
+        *args,
+        **kwargs: Unpack[ImagesKwargs],
+    ) -> BatchFeature:
+        """
+        Preprocess image-like inputs by preparing them and dispatching to `_preprocess`.
+        This method first calls `_prepare_image_like_inputs` to convert raw inputs into the backend's
+        format, then calls `_preprocess` for the actual batch processing. Override this method in
+        model-specific processors that need to handle multiple image-like input types (e.g., images
+        and segmentation maps) or need custom orchestration of the preprocessing pipeline.
+        Note that the extra positional `*args` are forwarded to `_preprocess` only; the preparation step
+        receives just the keyword arguments.
+        """
+        images = self._prepare_image_like_inputs(images, **kwargs)
+        return self._preprocess(images, *args, **kwargs)
+    def _standardize_kwargs(
+        self,
+        size: int | Iterable[int] | dict[str, int] | SizeDict | None = None,
+        crop_size: int | Iterable[int] | dict[str, int] | SizeDict | None = None,
+        pad_size: int | Iterable[int] | dict[str, int] | SizeDict | None = None,
+        default_to_square: bool | None = None,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Standardize kwargs to canonical format before validation.
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if kwargs is None:
+            kwargs = {}
+        if size is not None and not isinstance(size, SizeDict):
+            size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
+        if crop_size is not None and not isinstance(crop_size, SizeDict):
+            crop_size = SizeDict(**get_size_dict(crop_size, param_name="crop_size"))
+        if pad_size is not None and not isinstance(pad_size, SizeDict):
+            pad_size = SizeDict(**get_size_dict(size=pad_size, param_name="pad_size"))
+        if isinstance(image_mean, list):
+            image_mean = tuple(image_mean)
+        if isinstance(image_std, list):
+            image_std = tuple(image_std)
+        kwargs["size"] = size
+        kwargs["crop_size"] = crop_size
+        kwargs["pad_size"] = pad_size
+        kwargs["image_mean"] = image_mean
+        kwargs["image_std"] = image_std
+        return kwargs
+    # Backwards compatibility for method that was renamed
+    _further_process_kwargs = _standardize_kwargs
+    def _validate_preprocess_kwargs(
+        self,
+        do_rescale: bool | None = None,
+        rescale_factor: float | None = None,
+        do_normalize: bool | None = None,
+        image_mean: float | tuple[float] | None = None,
+        image_std: float | tuple[float] | None = None,
+        do_resize: bool | None = None,
+        size: SizeDict | None = None,
+        do_center_crop: bool | None = None,
+        crop_size: SizeDict | None = None,
+        resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
+        **kwargs,
+    ):
+        """
+        Validate the kwargs for the preprocess method.
+        Forwards the named arguments to `validate_preprocess_arguments`; any additional keyword arguments are
+        accepted and ignored, so the full preprocess kwargs dict can be passed in with `**`.
+        """
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+    @auto_docstring
+    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images.
+        """
+        # Perform type validation on received kwargs (checked against `self.valid_kwargs`)
+        validate_typed_dict(self.valid_kwargs, kwargs)
+        # Set default kwargs from self: `setdefault` only fills names the caller did not pass, using the
+        # same-named attribute of this instance (or `None` when the attribute does not exist).
+        for kwarg_name in self._valid_kwargs_names:
+            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
+        # Update kwargs that need further processing before being validated
+        # (`_standardize_kwargs` returns the converted kwargs dictionary).
+        kwargs = self._standardize_kwargs(**kwargs)
+        # Validate kwargs
+        self._validate_preprocess_kwargs(**kwargs)
+        # Hand the images, positional extras and the standardized kwargs to the actual implementation.
+        return self._preprocess_image_like_inputs(images, *args, **kwargs)
+    def to_dict(self) -> dict[str, Any]:
+        processor_dict = super().to_dict()
+        # Filter out None values that are class defaults
+        filtered_dict = {}
+        for key, value in processor_dict.items():
+            if isinstance(value, SizeDict):
+                value = dict(value)
+            if value is None:
+                class_default = getattr(type(self), key, "NOT_FOUND")
+                # Keep None if user explicitly set it (class default is non-None)
+                if class_default != "NOT_FOUND" and class_default is not None:
+                    filtered_dict[key] = value
+            else:
+                filtered_dict[key] = value
+        filtered_dict.pop("_valid_processor_keys", None)
+        filtered_dict.pop("_valid_kwargs_names", None)
+        return filtered_dict
+    def rescale(
+        self,
+        image: np.ndarray,
+        scale: float,
+        data_format: str | ChannelDimension | None = None,
+        input_data_format: str | ChannelDimension | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Rescale an image by a scale factor. image = image * scale.
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            scale (`float`):
+                The scaling factor to rescale pixel values by.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        Returns:
+            `np.ndarray`: The rescaled image.
+        """
+        return rescale(image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs)
+    # The next methods are kept for backwards compatibility with remote code, but are overridden by backends.
+    def normalize(
+        self,
+        image: np.ndarray,
+        mean: float | Iterable[float],
+        std: float | Iterable[float],
+        data_format: str | ChannelDimension | None = None,
+        input_data_format: str | ChannelDimension | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Normalize an image. image = (image - image_mean) / image_std.
+        Args:
+            image (`np.ndarray`):
+                Image to normalize.
+            mean (`float` or `Iterable[float]`):
+                Image mean to use for normalization.
+            std (`float` or `Iterable[float]`):
+                Image standard deviation to use for normalization.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        Returns:
+            `np.ndarray`: The normalized image.
+        """
+        return normalize(
+            image, mean=mean, std=std, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
+    def center_crop(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        data_format: str | ChannelDimension | None = None,
+        input_data_format: str | ChannelDimension | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along
+        any edge, the image is padded with 0's and then center cropped.
+        Args:
+            image (`np.ndarray`):
+                Image to center crop.
+            size (`dict[str, int]`):
+                Size of the output image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
+        return center_crop(
+            image,
+            size=(size["height"], size["width"]),
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+# Key sets a size dictionary may have. A size dictionary is accepted by `is_valid_size_dict` only when its
+# keys are exactly equal to one of these sets (no missing and no extra keys).
+VALID_SIZE_DICT_KEYS = (
+    {"height", "width"},
+    {"shortest_edge"},
+    {"shortest_edge", "longest_edge"},
+    {"longest_edge"},
+    {"max_height", "max_width"},
+)
+def is_valid_size_dict(size_dict):
+    if not isinstance(size_dict, dict):
+        return False
+    size_dict_keys = set(size_dict.keys())
+    for allowed_keys in VALID_SIZE_DICT_KEYS:
+        if size_dict_keys == allowed_keys:
+            return True
+    return False
+def convert_to_size_dict(
+    size: int | Iterable[int] | None = None,
+    max_size: int | None = None,
+    default_to_square: bool = True,
+    height_width_order: bool = True,
+) -> dict[str, int]:
+    # By default, if size is an int we assume it represents a tuple of (size, size).
+    if isinstance(size, int) and default_to_square:
+        if max_size is not None:
+            raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size")
+        return {"height": size, "width": size}
+    # In other configs, if size is an int and default_to_square is False, size represents the length of
+    # the shortest edge after resizing.
+    elif isinstance(size, int) and not default_to_square:
+        size_dict = {"shortest_edge": size}
+        if max_size is not None:
+            size_dict["longest_edge"] = max_size
+        return size_dict
+    # Otherwise, if size is a tuple it's either (height, width) or (width, height)
+    elif isinstance(size, (tuple, list)) and height_width_order:
+        return {"height": size[0], "width": size[1]}
+    elif isinstance(size, (tuple, list)) and not height_width_order:
+        return {"height": size[1], "width": size[0]}
+    elif size is None and max_size is not None:
+        if default_to_square:
+            raise ValueError("Cannot specify both default_to_square=True and max_size")
+        return {"longest_edge": max_size}
+    raise ValueError(f"Could not convert size input to size dict: {size}")
+def get_size_dict(
+    size: int | Iterable[int] | dict[str, int] | SizeDict | None = None,
+    max_size: int | None = None,
+    height_width_order: bool = True,
+    default_to_square: bool = True,
+    param_name="size",
+) -> dict:
+    """
+    Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards
+    compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height,
+    width) or (width, height) format.
+    - If `size` is tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width":
+    size[0]}` if `height_width_order` is `False`.
+    - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`.
+    - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size`
+      is set, it is added to the dict as `{"longest_edge": max_size}`.
+    - If `size` is `None` and `default_to_square` is False, the result is `{"longest_edge": max_size}` (requires
+      `max_size` to be set). Tuple/list/SizeDict/dict `size` values do not use `max_size`.
+    Args:
+        size (`int | Iterable[int] | dict[str, int] | SizeDict`, *optional*):
+            The `size` parameter to be cast into a size dictionary.
+        max_size (`int | None`, *optional*):
+            With `default_to_square=False`, sets `longest_edge` when `size` is an int or `None`; unused for dict,
+            `SizeDict`, or tuple/list `size`. Raises if set with `default_to_square=True` when `size` is an int or `None`.
+        height_width_order (`bool`, *optional*, defaults to `True`):
+            If `size` is a tuple, whether it's in (height, width) or (width, height) order.
+        default_to_square (`bool`, *optional*, defaults to `True`):
+            If `size` is an int, whether to default to a square image or not.
+    """
+    if not isinstance(size, dict | SizeDict):
+        size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order)
+        logger.info(
+            f"{param_name} should be a dictionary with one of the following sets of keys: {VALID_SIZE_DICT_KEYS}, got {size}."
+            f" Converted to {size_dict}.",
+        )
+    # Some remote code bypasses or overrides `_standardize_kwargs`, so handle `SizeDict` `size` here too.
+    elif isinstance(size, SizeDict):
+        size_dict = dict(size)
+    else:
+        size_dict = size
+    if not is_valid_size_dict(size_dict):
+        raise ValueError(
+            f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}"
+        )
+    return size_dict
+def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+    This is done by calculating the effective and wasted resolution for each possible resolution.
+    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+    Args:
+        original_size (tuple):
+            The original size of the image in the format (height, width).
+        possible_resolutions (list):
+            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+    Returns:
+        tuple: The best fit resolution in the format (height, width).
+    """
+    original_height, original_width = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+    for height, width in possible_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (height, width)
+    return best_fit
+def get_patch_output_size(image, target_resolution, input_data_format):
+    """
+    Given an image and a target resolution, calculate the output size of the image after cropping to the target
+    """
+    original_height, original_width = get_image_size(image, channel_dim=input_data_format)
+    target_height, target_width = target_resolution
+    scale_w = target_width / original_width
+    scale_h = target_height / original_height
+    if scale_w < scale_h:
+        new_width = target_width
+        new_height = min(math.ceil(original_height * scale_w), target_height)
+    else:
+        new_height = target_height
+        new_width = min(math.ceil(original_width * scale_h), target_width)
+    return new_height, new_width

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_utils.py ADDED Viewed

	@@ -0,0 +1,1069 @@

+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass, fields
+from io import BytesIO
+from typing import Any, Union
+import httpx
+import numpy as np
+from .utils import (
+    ExplicitEnum,
+    is_numpy_array,
+    is_torch_available,
+    is_torch_tensor,
+    is_torchvision_available,
+    is_vision_available,
+    logging,
+    requires_backends,
+    to_numpy,
+)
+from .utils.constants import (  # noqa: F401
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+)
+from .utils.import_utils import requires
+if is_vision_available():
+    import PIL.Image
+    import PIL.ImageOps
+    PILImageResampling = PIL.Image.Resampling
+if is_torchvision_available():
+    from torchvision.io import ImageReadMode, decode_image
+    from torchvision.transforms import InterpolationMode
+    from torchvision.transforms.functional import pil_to_tensor
+    # Lookup table from PIL resampling filters to torchvision interpolation modes.
+    # NOTE(review): `PILImageResampling` is only bound when `is_vision_available()` is true; this presumably
+    # relies on torchvision never being installed without Pillow — confirm.
+    pil_torch_interpolation_mapping = {
+        # PIL's NEAREST is mapped to NEAREST_EXACT, not to torchvision's NEAREST.
+        PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT,
+        PILImageResampling.BOX: InterpolationMode.BOX,
+        PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
+        PILImageResampling.HAMMING: InterpolationMode.HAMMING,
+        PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
+        PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
+    }
+    # Create inverse mapping: InterpolationMode -> PILImageResampling
+    torch_pil_interpolation_mapping = {v: k for k, v in pil_torch_interpolation_mapping.items()}
+else:
+    # Without torchvision both lookup tables exist but are empty.
+    pil_torch_interpolation_mapping = {}
+    torch_pil_interpolation_mapping = {}
+if is_torch_available():
+    import torch
+logger = logging.get_logger(__name__)
+# Type alias for the image inputs accepted by the processing helpers: a single PIL image, NumPy array or
+# torch tensor, or a list of one of those. PIL and torch types are given as strings so that the alias can
+# be evaluated without those packages being imported.
+ImageInput = Union[
+    "PIL.Image.Image", np.ndarray, "torch.Tensor", list["PIL.Image.Image"], list[np.ndarray], list["torch.Tensor"]
+]
+class ChannelDimension(ExplicitEnum):
+    """Where the channel axis sits in an image array: before (`FIRST`) or after (`LAST`) height and width."""
+    FIRST = "channels_first"
+    LAST = "channels_last"
+class AnnotationFormat(ExplicitEnum):
+    """Names of the supported annotation formats (COCO detection and COCO panoptic)."""
+    COCO_DETECTION = "coco_detection"
+    COCO_PANOPTIC = "coco_panoptic"
+# Type alias for one annotation record: string keys mapped to an int, a string, or a list of dicts.
+AnnotationType = dict[str, int | str | list[dict]]
+def is_pil_image(img):
+    return is_vision_available() and isinstance(img, PIL.Image.Image)
+class ImageType(ExplicitEnum):
+    """Kind of object holding an image, as reported by `get_image_type`."""
+    PIL = "pillow"
+    TORCH = "torch"
+    NUMPY = "numpy"
+def get_image_type(image):
+    if is_pil_image(image):
+        return ImageType.PIL
+    if is_torch_tensor(image):
+        return ImageType.TORCH
+    if is_numpy_array(image):
+        return ImageType.NUMPY
+    raise ValueError(f"Unrecognized image type {type(image)}")
+def is_valid_image(img):
+    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img)
+def is_valid_list_of_images(images: list):
+    return images and all(is_valid_image(image) for image in images)
+def concatenate_list(input_list):
+    if isinstance(input_list[0], list):
+        return [item for sublist in input_list for item in sublist]
+    elif isinstance(input_list[0], np.ndarray):
+        return np.concatenate(input_list, axis=0)
+    elif isinstance(input_list[0], torch.Tensor):
+        return torch.cat(input_list, dim=0)
+def valid_images(imgs):
+    # If we have an list of images, make sure every image is valid
+    if isinstance(imgs, (list, tuple)):
+        for img in imgs:
+            if not valid_images(img):
+                return False
+    # If not a list of tuple, we have been given a single image or batched tensor of images
+    elif not is_valid_image(imgs):
+        return False
+    return True
+def is_batched(img):
+    if isinstance(img, (list, tuple)):
+        return is_valid_image(img[0])
+    return False
+def is_scaled_image(image: np.ndarray) -> bool:
+    """
+    Checks to see whether the pixel values have already been rescaled to [0, 1].
+    """
+    if image.dtype == np.uint8:
+        return False
+    # It's possible the image has pixel values in [0, 255] but is of floating type
+    return np.min(image) >= 0 and np.max(image) <= 1
+def make_list_of_images(images, expected_ndims: int = 3) -> list[ImageInput]:
+    """
+    Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1.
+    If the input is a batch of images, it is converted to a list of images.
+    Args:
+        images (`ImageInput`):
+            Image of images to turn into a list of images.
+        expected_ndims (`int`, *optional*, defaults to 3):
+            Expected number of dimensions for a single input image. If the input image has a different number of
+            dimensions, an error is raised.
+    """
+    if is_batched(images):
+        return images
+    # Either the input is a single image, in which case we create a list of length 1
+    if is_pil_image(images):
+        # PIL images are never batched
+        return [images]
+    if is_valid_image(images):
+        if images.ndim == expected_ndims + 1:
+            # Batch of images
+            images = list(images)
+        elif images.ndim == expected_ndims:
+            # Single image
+            images = [images]
+        else:
+            raise ValueError(
+                f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got"
+                f" {images.ndim} dimensions."
+            )
+        return images
+    raise ValueError(
+        f"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, or torch.Tensor, but got {type(images)}."
+    )
+def make_flat_list_of_images(
+    images: list[ImageInput] | ImageInput,
+    expected_ndims: int = 3,
+) -> ImageInput:
+    """
+    Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
+    If the input is a nested list of images, it is converted to a flat list of images.
+    Args:
+        images (`Union[list[ImageInput], ImageInput]`):
+            The input image, list of images, or nested list of images.
+        expected_ndims (`int`, *optional*, defaults to 3):
+            The expected number of dimensions for a single input image. Arrays/tensors with
+            `expected_ndims + 1` dimensions are treated as batches and split along their first axis.
+    Returns:
+        list: A flat list of images. A list of single images that is already flat is returned as is.
+    Raises:
+        ValueError: If none of the supported input layouts matches.
+    """
+    # If the input is a nested list of images, we flatten it
+    # (every inner item must be a list/tuple that is either empty or holds only valid images).
+    if (
+        isinstance(images, (list, tuple))
+        and all(isinstance(images_i, (list, tuple)) for images_i in images)
+        and all(is_valid_list_of_images(images_i) or not images_i for images_i in images)
+    ):
+        return [img for img_list in images for img in img_list]
+    # A flat list of valid images: either single images (returned unchanged) or batched arrays to unroll.
+    # The dimensionality decision is based on the first element only.
+    if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
+        if is_pil_image(images[0]) or images[0].ndim == expected_ndims:
+            return images
+        if images[0].ndim == expected_ndims + 1:
+            return [img for img_list in images for img in img_list]
+    # A single image, or a single batched array/tensor.
+    if is_valid_image(images):
+        if is_pil_image(images) or images.ndim == expected_ndims:
+            return [images]
+        if images.ndim == expected_ndims + 1:
+            return list(images)
+    # Anything that fell through every branch above (including unexpected dimensionality) is rejected.
+    raise ValueError(f"Could not make a flat list of images from {images}")
+def make_nested_list_of_images(
+    images: list[ImageInput] | ImageInput,
+    expected_ndims: int = 3,
+) -> list[ImageInput]:
+    """
+    Ensure that the output is a nested list of images (a list of batches, each batch a list of images).
+    Args:
+        images (`Union[list[ImageInput], ImageInput]`):
+            The input image, list of images, or list of batches of images.
+        expected_ndims (`int`, *optional*, defaults to 3):
+            The expected number of dimensions for a single input image. Arrays/tensors with
+            `expected_ndims + 1` dimensions are treated as batches and split along their first axis.
+    Returns:
+        list: A list of list of images. An input that is already a list of batches is returned as is.
+    Raises:
+        ValueError: If none of the supported input layouts matches.
+    """
+    # If it's a list of batches, it's already in the right format
+    # (every inner item must be a list/tuple that is either empty or holds only valid images).
+    if (
+        isinstance(images, (list, tuple))
+        and all(isinstance(images_i, (list, tuple)) for images_i in images)
+        and all(is_valid_list_of_images(images_i) or not images_i for images_i in images)
+    ):
+        return images
+    # If it's a list of images, it's a single batch, so convert it to a list of lists
+    # The dimensionality decision is based on the first element only.
+    if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
+        if is_pil_image(images[0]) or images[0].ndim == expected_ndims:
+            return [images]
+        # Each element is itself a batched array/tensor: every one becomes its own inner list.
+        if images[0].ndim == expected_ndims + 1:
+            return [list(image) for image in images]
+    # If it's a single image, convert it to a list of lists
+    if is_valid_image(images):
+        if is_pil_image(images) or images.ndim == expected_ndims:
+            return [[images]]
+        if images.ndim == expected_ndims + 1:
+            return [list(images)]
+    # Anything that fell through every branch above (including unexpected dimensionality) is rejected.
+    raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
+def to_numpy_array(img) -> np.ndarray:
+    if not is_valid_image(img):
+        raise ValueError(f"Invalid image type: {type(img)}")
+    if is_vision_available() and isinstance(img, PIL.Image.Image):
+        return np.array(img)
+    return to_numpy(img)
+def infer_channel_dimension_format(
+    image: np.ndarray, num_channels: int | tuple[int, ...] | None = None
+) -> ChannelDimension:
+    """
+    Infers the channel dimension format of `image`.
+    Args:
+        image (`np.ndarray`):
+            The image to infer the channel dimension of.
+        num_channels (`int` or `tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
+            The number of channels of the image.
+    Returns:
+        The channel dimension of the image.
+    """
+    num_channels = num_channels if num_channels is not None else (1, 3)
+    num_channels = (num_channels,) if isinstance(num_channels, int) else num_channels
+    if image.ndim == 3:
+        first_dim, last_dim = 0, 2
+    elif image.ndim == 4:
+        first_dim, last_dim = 1, 3
+    elif image.ndim == 5:
+        first_dim, last_dim = 2, 4
+    else:
+        raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
+    if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
+        logger.warning(
+            f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension."
+        )
+        return ChannelDimension.FIRST
+    elif image.shape[first_dim] in num_channels:
+        return ChannelDimension.FIRST
+    elif image.shape[last_dim] in num_channels:
+        return ChannelDimension.LAST
+    raise ValueError("Unable to infer channel dimension format")
+def get_channel_dimension_axis(image: np.ndarray, input_data_format: ChannelDimension | str | None = None) -> int:
+    """
+    Returns the channel dimension axis of the image.
+    Args:
+        image (`np.ndarray`):
+            The image to get the channel dimension axis of.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the image. If `None`, will infer the channel dimension from the image.
+    Returns:
+        The channel dimension axis of the image.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+    if input_data_format == ChannelDimension.FIRST:
+        return image.ndim - 3
+    elif input_data_format == ChannelDimension.LAST:
+        return image.ndim - 1
+    raise ValueError(f"Unsupported data format: {input_data_format}")
+def get_image_size(image: np.ndarray, channel_dim: ChannelDimension | None = None) -> tuple[int, int]:
+    """
+    Returns the (height, width) dimensions of the image.
+    Args:
+        image (`np.ndarray`):
+            The image to get the dimensions of.
+        channel_dim (`ChannelDimension`, *optional*):
+            Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.
+    Returns:
+        A tuple of the image's height and width.
+    """
+    if channel_dim is None:
+        channel_dim = infer_channel_dimension_format(image)
+    if channel_dim == ChannelDimension.FIRST:
+        return image.shape[-2], image.shape[-1]
+    elif channel_dim == ChannelDimension.LAST:
+        return image.shape[-3], image.shape[-2]
+    else:
+        raise ValueError(f"Unsupported data format: {channel_dim}")
+def get_image_size_for_max_height_width(
+    image_size: tuple[int, int],
+    max_height: int,
+    max_width: int,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
+    Important, even if image_height < max_height and image_width < max_width, the image will be resized
+    to at least one of the edges be equal to max_height or max_width.
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+    Args:
+        image_size (`tuple[int, int]`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+    """
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Return the maximum value across all indices of an iterable of values.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+def get_max_height_width(
+    images: list[Union["torch.Tensor", np.ndarray]], input_data_format: str | ChannelDimension = ChannelDimension.FIRST
+) -> list[int]:
+    """
+    Get the maximum height and width across all images in a batch.
+    """
+    if input_data_format == ChannelDimension.FIRST:
+        _, max_height, max_width = max_across_indices([img.shape for img in images])
+    elif input_data_format == ChannelDimension.LAST:
+        max_height, max_width, _ = max_across_indices([img.shape for img in images])
+    else:
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+    return (max_height, max_width)
+def is_valid_annotation_coco_detection(annotation: dict[str, list | tuple]) -> bool:
+    if (
+        isinstance(annotation, dict)
+        and "image_id" in annotation
+        and "annotations" in annotation
+        and isinstance(annotation["annotations"], (list, tuple))
+        and (
+            # an image can have no annotations
+            len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
+        )
+    ):
+        return True
+    return False
+def is_valid_annotation_coco_panoptic(annotation: dict[str, list | tuple]) -> bool:
+    if (
+        isinstance(annotation, dict)
+        and "image_id" in annotation
+        and "segments_info" in annotation
+        and "file_name" in annotation
+        and isinstance(annotation["segments_info"], (list, tuple))
+        and (
+            # an image can have no segments
+            len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
+        )
+    ):
+        return True
+    return False
+def valid_coco_detection_annotations(annotations: Iterable[dict[str, list | tuple]]) -> bool:
+    return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
+def valid_coco_panoptic_annotations(annotations: Iterable[dict[str, list | tuple]]) -> bool:
+    return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
+def load_image(
+    image: Union[str, "PIL.Image.Image"],
+    timeout: float | None = None,
+) -> "PIL.Image.Image":
+    """
+    Loads `image` to a PIL Image.
+    A string is tried, in this order, as an `http://` / `https://` URL, as a path to an existing file, and finally
+    as base64 data (a leading `data:image/...,` prefix is dropped first). The loaded image then goes through
+    `PIL.ImageOps.exif_transpose` and is converted to RGB.
+    Args:
+        image (`str` or `PIL.Image.Image`):
+            The image to convert to the PIL Image format.
+        timeout (`float`, *optional*):
+            The timeout value in seconds for the URL request.
+    Returns:
+        `PIL.Image.Image`: A PIL Image.
+    Raises:
+        ValueError: If a string input is neither a URL nor an existing file and cannot be opened as base64 data.
+        TypeError: If `image` is neither a string nor a PIL image.
+    """
+    requires_backends(load_image, ["vision"])
+    if not isinstance(image, str):
+        if not isinstance(image, PIL.Image.Image):
+            raise TypeError(
+                "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
+            )
+    elif image.startswith(("http://", "https://")):
+        # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+        # like http_huggingface_co.png
+        response = httpx.get(image, timeout=timeout, follow_redirects=True)
+        image = PIL.Image.open(BytesIO(response.content))
+    elif os.path.isfile(image):
+        image = PIL.Image.open(image)
+    else:
+        if image.startswith("data:image/"):
+            image = image.split(",")[1]
+        # Try to load as base64
+        try:
+            decoded = base64.decodebytes(image.encode())
+            image = PIL.Image.open(BytesIO(decoded))
+        except Exception as e:
+            raise ValueError(
+                f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
+            )
+    upright = PIL.ImageOps.exif_transpose(image)
+    return upright.convert("RGB")
+@requires(backends=("torchvision",))
+def load_image_as_tensor(
+    image: Union[str, "PIL.Image.Image"],
+    timeout: float | None = None,
+) -> "torch.Tensor":
+    """
+    Loads `image` directly to a `torch.Tensor` using torchvision.
+    A string is tried, in this order, as an `http://` / `https://` URL, as a path to an existing file, and finally
+    as base64 data (a leading `data:image/...,` prefix is dropped first). Only the PIL input path calls
+    `PIL.ImageOps.exif_transpose`; the other paths go straight to `decode_image`.
+    Args:
+        image (`str` or `PIL.Image.Image`):
+            The image to load.
+        timeout (`float`, *optional*):
+            The timeout value in seconds for the URL request.
+    Returns:
+        `torch.Tensor`: A `[C, H, W]` uint8 tensor in RGB channel order.
+    Raises:
+        ValueError: If a string input is neither a URL nor an existing file and cannot be base64-decoded.
+        TypeError: If `image` is neither a string nor a PIL image.
+    """
+    import torch
+    if not isinstance(image, str):
+        if isinstance(image, PIL.Image.Image):
+            upright = PIL.ImageOps.exif_transpose(image)
+            return pil_to_tensor(upright.convert("RGB"))
+        raise TypeError(
+            "Incorrect format used for image. Should be a URL, a local path, a base64 string, or a PIL image."
+        )
+    if image.startswith(("http://", "https://")):
+        encoded = httpx.get(image, timeout=timeout, follow_redirects=True).content
+    elif os.path.isfile(image):
+        # Files are handed to `decode_image` by path, without reading the bytes here.
+        return decode_image(image, mode=ImageReadMode.RGB)
+    else:
+        if image.startswith("data:image/"):
+            image = image.split(",")[1]
+        try:
+            encoded = base64.decodebytes(image.encode())
+        except Exception as e:
+            raise ValueError(
+                f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
+            )
+    # Downloaded and base64-decoded bytes share the same in-memory decoding step.
+    byte_tensor = torch.frombuffer(bytearray(encoded), dtype=torch.uint8)
+    return decode_image(byte_tensor, mode=ImageReadMode.RGB)
+def load_images(
+    images: Union[list, tuple, str, "PIL.Image.Image"], timeout: float | None = None
+) -> Union["PIL.Image.Image", list["PIL.Image.Image"], list[list["PIL.Image.Image"]]]:
+    """Loads images, handling different levels of nesting.
+    The nesting level is decided from the first element only: if it is a list or tuple, every element is treated
+    as a group of images.
+    Args:
+      images: A single image, a list of images, or a list of lists of images to load.
+      timeout: Timeout for loading images.
+    Returns:
+      A single image, a list of images, a list of lists of images.
+    """
+    if not isinstance(images, (list, tuple)):
+        return load_image(images, timeout=timeout)
+    if len(images) and isinstance(images[0], (list, tuple)):
+        loaded_groups = []
+        for group in images:
+            loaded_groups.append([load_image(item, timeout=timeout) for item in group])
+        return loaded_groups
+    return [load_image(item, timeout=timeout) for item in images]
+def validate_preprocess_arguments(
+    do_rescale: bool | None = None,
+    rescale_factor: float | None = None,
+    do_normalize: bool | None = None,
+    image_mean: float | list[float] | None = None,
+    image_std: float | list[float] | None = None,
+    do_pad: bool | None = None,
+    pad_size: dict[str, int] | int | None = None,
+    do_center_crop: bool | None = None,
+    crop_size: dict[str, int] | None = None,
+    do_resize: bool | None = None,
+    size: dict[str, int] | None = None,
+    resample: Union["PILImageResampling", "InterpolationMode", int] | None = None,
+):
+    """
+    Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
+    Every enabled `do_*` flag must come with its companion value(s). The checks run in the order rescale, pad,
+    normalize, center crop, resize, and the first violated one raises `ValueError`.
+    Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
+    sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
+    existing arguments when possible.
+    """
+    if do_rescale:
+        if rescale_factor is None:
+            raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
+    if do_pad:
+        if pad_size is None:
+            # Processors pad images using different args depending on the model, so the below check is pointless
+            # but we keep it for BC for now. TODO: remove in v5
+            # Usually padding can be called with:
+            #   - "pad_size/size" if we're padding to specific values
+            #   - "size_divisor" if we're padding to any value divisible by X
+            #   - "None" if we're padding to the maximum size image in batch
+            raise ValueError(
+                "Depending on the model, `size_divisor` or `pad_size` or `size` must be specified if `do_pad` is `True`."
+            )
+    if do_normalize:
+        if image_mean is None or image_std is None:
+            raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")
+    if do_center_crop:
+        if crop_size is None:
+            raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.")
+    if do_resize:
+        if size is None or resample is None:
+            raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.")
+class ImageFeatureExtractionMixin:
+    """
+    Mixin that contain utilities for preparing image features.
+    """
+    def _ensure_format_supported(self, image):
+        """
+        Raises a `ValueError` unless `image` is a `PIL.Image.Image`, an `np.ndarray` or a torch tensor.
+        """
+        if isinstance(image, (PIL.Image.Image, np.ndarray)) or is_torch_tensor(image):
+            return
+        raise ValueError(
+            f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.ndarray` and "
+            "`torch.Tensor` are."
+        )
+    def to_pil_image(self, image, rescale=None):
+        """
+        Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
+        needed. A PIL image is returned unchanged.
+        Args:
+            image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
+                The image to convert to the PIL Image format.
+            rescale (`bool`, *optional*):
+                Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
+                default to `True` if the image type is a floating type, `False` otherwise.
+        """
+        self._ensure_format_supported(image)
+        if is_torch_tensor(image):
+            image = image.numpy()
+        if not isinstance(image, np.ndarray):
+            return image
+        if rescale is None:
+            # The default is inferred from the type of the first element.
+            rescale = isinstance(image.flat[0], np.floating)
+        # A 3D array whose first dimension is 1 or 3 is treated as channels-first and moved to channels-last.
+        looks_channel_first = image.ndim == 3 and image.shape[0] in [1, 3]
+        if looks_channel_first:
+            image = image.transpose(1, 2, 0)
+        if rescale:
+            image = image * 255
+        pixels = image.astype(np.uint8)
+        return PIL.Image.fromarray(pixels)
+    def convert_rgb(self, image):
+        """
+        Converts `PIL.Image.Image` to RGB format. Arrays and tensors are returned unchanged.
+        Args:
+            image (`PIL.Image.Image`):
+                The image to convert.
+        """
+        self._ensure_format_supported(image)
+        if isinstance(image, PIL.Image.Image):
+            return image.convert("RGB")
+        return image
+    def rescale(self, image: np.ndarray, scale: float | int) -> np.ndarray:
+        """
+        Multiplies `image` by `scale` and returns the result; the input type is validated first.
+        """
+        self._ensure_format_supported(image)
+        scaled = image * scale
+        return scaled
+    def to_numpy_array(self, image, rescale=None, channel_first=True):
+        """
+        Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
+        dimension.
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+                The image to convert to a NumPy array.
+            rescale (`bool`, *optional*):
+                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
+                default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
+            channel_first (`bool`, *optional*, defaults to `True`):
+                Whether or not to permute the dimensions of the image to put the channel dimension first.
+        """
+        self._ensure_format_supported(image)
+        if isinstance(image, PIL.Image.Image):
+            image = np.array(image)
+        if is_torch_tensor(image):
+            image = image.numpy()
+        if rescale is None:
+            # The default is inferred from the type of the first element.
+            rescale = isinstance(image.flat[0], np.integer)
+        if rescale:
+            image = self.rescale(image.astype(np.float32), 1 / 255.0)
+        if not (channel_first and image.ndim == 3):
+            return image
+        # Any 3D array is permuted here, moving the last axis to the front.
+        return image.transpose(2, 0, 1)
+    def expand_dims(self, image):
+        """
+        Adds a leading axis to `image` (e.g. to expand a 2-dimensional image to 3 dimensions). PIL images are
+        returned unchanged.
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+                The image to expand.
+        """
+        self._ensure_format_supported(image)
+        # Do nothing if PIL image
+        if isinstance(image, PIL.Image.Image):
+            return image
+        if is_torch_tensor(image):
+            return image.unsqueeze(0)
+        return np.expand_dims(image, axis=0)
+    def normalize(self, image, mean, std, rescale=False):
+        """
+        Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
+        if it's a PIL Image.
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+                The image to normalize.
+            mean (`list[float]` or `np.ndarray` or `torch.Tensor`):
+                The mean (per channel) to use for normalization.
+            std (`list[float]` or `np.ndarray` or `torch.Tensor`):
+                The standard deviation (per channel) to use for normalization.
+            rescale (`bool`, *optional*, defaults to `False`):
+                Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
+                happen automatically.
+        Returns:
+            The result of `(image - mean) / std`. When `image` has 3 dimensions and its first dimension has size 1
+            or 3, `mean` and `std` are indexed as `[:, None, None]` so they apply per channel; otherwise they are
+            combined with `image` directly.
+        """
+        self._ensure_format_supported(image)
+        if isinstance(image, PIL.Image.Image):
+            image = self.to_numpy_array(image, rescale=True)
+        # If the input image is a PIL image, it automatically gets rescaled. If it's another
+        # type it may need rescaling.
+        elif rescale:
+            if isinstance(image, np.ndarray):
+                image = self.rescale(image.astype(np.float32), 1 / 255.0)
+            elif is_torch_tensor(image):
+                image = self.rescale(image.float(), 1 / 255.0)
+        # Bring `mean` and `std` to the same container type as `image`.
+        if isinstance(image, np.ndarray):
+            # Non-array statistics are converted and cast to the image dtype; arrays are used as given.
+            if not isinstance(mean, np.ndarray):
+                mean = np.array(mean).astype(image.dtype)
+            if not isinstance(std, np.ndarray):
+                std = np.array(std).astype(image.dtype)
+        elif is_torch_tensor(image):
+            import torch
+            # No dtype or device is passed when building these tensors.
+            if not isinstance(mean, torch.Tensor):
+                if isinstance(mean, np.ndarray):
+                    mean = torch.from_numpy(mean)
+                else:
+                    mean = torch.tensor(mean)
+            if not isinstance(std, torch.Tensor):
+                if isinstance(std, np.ndarray):
+                    std = torch.from_numpy(std)
+                else:
+                    std = torch.tensor(std)
+        # A 3D input whose first dimension is 1 or 3 is treated as channels-first.
+        if image.ndim == 3 and image.shape[0] in [1, 3]:
+            return (image - mean[:, None, None]) / std[:, None, None]
+        else:
+            return (image - mean) / std
+    def resize(self, image, size, resample=None, default_to_square=True, max_size=None):
+        """
+        Resizes `image`. Enforces conversion of input to PIL.Image.
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+                The image to resize.
+            size (`int` or `tuple[int, int]`):
+                The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
+                matched to this.
+                If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
+                `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
+                this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
+            resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                The filter to user for resampling.
+            default_to_square (`bool`, *optional*, defaults to `True`):
+                How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
+                square (`size`,`size`). If set to `False`, will replicate
+                [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
+                with support for resizing only the smallest edge and providing an optional `max_size`.
+            max_size (`int`, *optional*, defaults to `None`):
+                The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
+                greater than `max_size` after being resized according to `size`, then the image is resized again so
+                that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
+                edge may be shorter than `size`. Only used if `default_to_square` is `False`.
+        Returns:
+            image: A resized `PIL.Image.Image`.
+        Raises:
+            ValueError: If `max_size` is not strictly greater than the requested size of the smaller edge.
+        """
+        resample = resample if resample is not None else PILImageResampling.BILINEAR
+        self._ensure_format_supported(image)
+        if not isinstance(image, PIL.Image.Image):
+            image = self.to_pil_image(image)
+        if isinstance(size, list):
+            size = tuple(size)
+        # NOTE(review): a 2-element `size` is forwarded unchanged to `image.resize`, which PIL documents as
+        # `(width, height)`, while the docstring above describes `(h, w)` - confirm the intended order.
+        if isinstance(size, int) or len(size) == 1:
+            if default_to_square:
+                size = (size, size) if isinstance(size, int) else (size[0], size[0])
+            else:
+                width, height = image.size
+                # specified size only for the smallest edge
+                short, long = (width, height) if width <= height else (height, width)
+                requested_new_short = size if isinstance(size, int) else size[0]
+                # The image is returned as is (not resized, not copied) when the short edge already matches.
+                if short == requested_new_short:
+                    return image
+                # Keep the aspect ratio; the long edge is truncated to an int.
+                new_short, new_long = requested_new_short, int(requested_new_short * long / short)
+                if max_size is not None:
+                    if max_size <= requested_new_short:
+                        raise ValueError(
+                            f"max_size = {max_size} must be strictly greater than the requested "
+                            f"size for the smaller edge size = {size}"
+                        )
+                    # Cap the long edge at `max_size` and shrink the short edge proportionally.
+                    if new_long > max_size:
+                        new_short, new_long = int(max_size * new_short / new_long), max_size
+                # Rebuild the pair in (width, height) order, matching how `image.size` was unpacked above.
+                size = (new_short, new_long) if width <= height else (new_long, new_short)
+        return image.resize(size, resample=resample)
+    def center_crop(self, image, size):
+        """
+        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
+        size given, it will be padded (so the returned result has the size asked).
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
+                The image to crop.
+            size (`int` or `tuple[int, int]`):
+                The size to which crop the image. Anything that is not a tuple is expanded to `(size, size)`; the
+                first entry is used along the height and the second along the width.
+        Returns:
+            new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
+            height, width).
+        """
+        self._ensure_format_supported(image)
+        if not isinstance(size, tuple):
+            size = (size, size)
+        # PIL Image.size is (width, height) but NumPy array and torch Tensors have (height, width)
+        if is_torch_tensor(image) or isinstance(image, np.ndarray):
+            # 2D inputs get a leading axis of size 1, so they take the channels-first path below.
+            if image.ndim == 2:
+                image = self.expand_dims(image)
+            # A first dimension of size 1 or 3 is taken to be the channel axis.
+            image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2]
+        else:
+            image_shape = (image.size[1], image.size[0])
+        top = (image_shape[0] - size[0]) // 2
+        bottom = top + size[0]  # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
+        left = (image_shape[1] - size[1]) // 2
+        right = left + size[1]  # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
+        # For PIL Images we have a method to crop directly.
+        if isinstance(image, PIL.Image.Image):
+            return image.crop((left, top, right, bottom))
+        # Check if image is in (n_channels, height, width) or (height, width, n_channels) format
+        channel_first = image.shape[0] in [1, 3]
+        # Transpose (height, width, n_channels) format images
+        if not channel_first:
+            if isinstance(image, np.ndarray):
+                image = image.transpose(2, 0, 1)
+            if is_torch_tensor(image):
+                image = image.permute(2, 0, 1)
+        # Check if cropped area is within image boundaries
+        if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]:
+            return image[..., top:bottom, left:right]
+        # Otherwise, we may need to pad if the image is too small. Oh joy...
+        # The zero-filled canvas is at least as large as both the image and the requested crop.
+        new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1]))
+        if isinstance(image, np.ndarray):
+            new_image = np.zeros_like(image, shape=new_shape)
+        elif is_torch_tensor(image):
+            new_image = image.new_zeros(new_shape)
+        # Paste the original image in the middle of the canvas.
+        top_pad = (new_shape[-2] - image_shape[0]) // 2
+        bottom_pad = top_pad + image_shape[0]
+        left_pad = (new_shape[-1] - image_shape[1]) // 2
+        right_pad = left_pad + image_shape[1]
+        new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image
+        # Shift the crop window by the offset at which the image was pasted.
+        top += top_pad
+        bottom += top_pad
+        left += left_pad
+        right += left_pad
+        # Clamp the window to the canvas before slicing.
+        new_image = new_image[
+            ..., max(0, top) : min(new_image.shape[-2], bottom), max(0, left) : min(new_image.shape[-1], right)
+        ]
+        return new_image
+    def flip_channel_order(self, image):
+        """
+        Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
+        `image` to a NumPy array if it's a PIL Image.
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+                The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
+                be first.
+        Returns:
+            The image with its first dimension reversed, as an `np.ndarray` (for PIL and NumPy inputs) or a
+            `torch.Tensor` (for tensor inputs).
+        """
+        self._ensure_format_supported(image)
+        if isinstance(image, PIL.Image.Image):
+            image = self.to_numpy_array(image)
+        if is_torch_tensor(image):
+            # torch tensors reject negative slice steps, so `image[::-1, :, :]` raises for them;
+            # `flip` reverses the first dimension instead.
+            return image.flip(0)
+        return image[::-1, :, :]
+    def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None):
+        """
+        Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
+        counter clockwise around its centre. `resample` defaults to `PIL.Image.NEAREST`; the remaining arguments
+        are forwarded unchanged to `PIL.Image.Image.rotate`.
+        Args:
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+                The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
+                rotating.
+        Returns:
+            image: A rotated `PIL.Image.Image`.
+        """
+        if resample is None:
+            resample = PIL.Image.NEAREST
+        self._ensure_format_supported(image)
+        pil_image = image if isinstance(image, PIL.Image.Image) else self.to_pil_image(image)
+        rotate_options = {
+            "resample": resample,
+            "expand": expand,
+            "center": center,
+            "translate": translate,
+            "fillcolor": fillcolor,
+        }
+        return pil_image.rotate(angle, **rotate_options)
+def validate_annotations(
+    annotation_format: AnnotationFormat,
+    supported_annotation_formats: tuple[AnnotationFormat, ...],
+    annotations: list[dict],
+) -> None:
+    """
+    Validates `annotations` against the requested `annotation_format`.
+    Args:
+        annotation_format (`AnnotationFormat`):
+            The format the annotations are expected to follow.
+        supported_annotation_formats (`tuple[AnnotationFormat, ...]`):
+            The formats accepted by the caller.
+        annotations (`list[dict]`):
+            The annotations to check.
+    Raises:
+        ValueError: If `annotation_format` is not in `supported_annotation_formats`, or if the annotations fail
+            the structural check for the COCO detection / COCO panoptic format.
+    """
+    if annotation_format not in supported_annotation_formats:
+        # Report the offending value itself; `{format}` would interpolate the `format` builtin.
+        raise ValueError(
+            f"Unsupported annotation format: {annotation_format} must be one of {supported_annotation_formats}"
+        )
+    if annotation_format is AnnotationFormat.COCO_DETECTION:
+        if not valid_coco_detection_annotations(annotations):
+            raise ValueError(
+                "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts "
+                "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
+                "being a list of annotations in the COCO format."
+            )
+    if annotation_format is AnnotationFormat.COCO_PANOPTIC:
+        if not valid_coco_panoptic_annotations(annotations):
+            raise ValueError(
+                "Invalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts "
+                "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
+                "the latter being a list of annotations in the COCO format."
+            )
+def validate_kwargs(valid_processor_keys: list[str], captured_kwargs: list[str]):
+    """
+    Logs a warning listing every entry of `captured_kwargs` that is not in `valid_processor_keys`.
+    Nothing is logged when all captured keys are valid.
+    """
+    unused_keys = set(captured_kwargs).difference(set(valid_processor_keys))
+    if not unused_keys:
+        return
+    # TODO raise a warning here instead of simply logging?
+    logger.warning(f"Unused or unrecognized kwargs: {', '.join(unused_keys)}.")
+@dataclass()
+class SizeDict:
+    """
+    Hashable dictionary to store image size information.
+    Behaves like a small mapping whose only valid keys are the declared dataclass fields. A field left at
+    `None` counts as missing for iteration, `in` and `get`, while `[]` still returns `None` for it.
+    """
+    height: int | None = None
+    width: int | None = None
+    longest_edge: int | None = None
+    shortest_edge: int | None = None
+    max_height: int | None = None
+    max_width: int | None = None
+    def _is_field(self, key) -> bool:
+        # Only declared fields are keys. A plain `hasattr` check would also accept methods such as `get` or
+        # dunder attributes, and would raise `TypeError` for non-string keys.
+        return isinstance(key, str) and key in self.__dataclass_fields__
+    def __getitem__(self, key):
+        if self._is_field(key):
+            return getattr(self, key)
+        raise KeyError(f"Key {key} not found in SizeDict.")
+    def get(self, key, default=None):
+        # Unknown keys and fields set to `None` both fall back to `default`.
+        if key in self:
+            return getattr(self, key)
+        return default
+    def __iter__(self):
+        # Yield only non-None (key, value) pairs so dict(self) excludes missing values.
+        for f in fields(self):
+            val = getattr(self, f.name)
+            if val is not None:
+                yield f.name, val
+    def __hash__(self):
+        # Hash the field values in declaration order, the same tuple `__eq__` compares.
+        return hash(tuple(getattr(self, f.name) for f in fields(self)))
+    def __contains__(self, key):
+        return self._is_field(key) and getattr(self, key) is not None
+    def __setitem__(self, key, value):
+        if not self._is_field(key):
+            raise KeyError(f"Key {key} is not a valid field of SizeDict.")
+        object.__setattr__(self, key, value)
+    def __eq__(self, other):
+        if isinstance(other, dict):
+            # Against a plain dict only the non-None fields take part in the comparison.
+            return dict(self) == other
+        if isinstance(other, SizeDict):
+            return tuple(getattr(self, f.name) for f in fields(self)) == tuple(
+                getattr(other, f.name) for f in fields(self)
+            )
+        return NotImplemented
+    def __or__(self, other) -> "SizeDict":
+        # `size | other`: values from `other` win, the result is a new SizeDict.
+        if isinstance(other, dict | SizeDict):
+            merged = dict(self)
+            merged.update(dict(other))
+            return SizeDict(**merged)
+        return NotImplemented
+    def __ror__(self, other) -> dict:
+        # `some_dict | size`: values from the SizeDict win, the result is a plain dict.
+        if isinstance(other, dict):
+            merged = dict(other)
+            merged.update(dict(self))
+            return merged
+        return NotImplemented

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/masking_utils.py ADDED Viewed

	@@ -0,0 +1,1514 @@

+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+import torch
+import torch.nn.functional as F
+from .cache_utils import Cache
+from .configuration_utils import PreTrainedConfig
+from .utils import is_torch_xpu_available, logging
+from .utils.generic import GeneralInterface, is_flash_attention_requested
+from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_tracing
+# Flex attention symbols are only imported when the installed torch provides them.
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
+    from torch.nn.attention.flex_attention import BlockMask, create_block_mask
+else:
+    # Register a fake type to avoid crashing for annotations and `isinstance` checks
+    BlockMask = torch.Tensor
+# Version and backend checks are evaluated once at import time and cached in module-level flags.
+_is_torch_greater_or_equal_than_2_5 = is_torch_greater_or_equal("2.5", accept_dev=True)
+_is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
+_is_torch_xpu_available = is_torch_xpu_available()
+# `TransformGetItemToIndex` is only imported on torch >= 2.6.
+if _is_torch_greater_or_equal_than_2_6:
+    from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
+logger = logging.get_logger(__name__)
+def and_masks(*mask_functions: Callable) -> Callable:
+    """
+    Returns a mask function that is the intersection of provided mask functions.
+    The combined function starts from a scalar `True` created from `q_idx` and ANDs in the result of each mask
+    function, moved to the same device. Raises `RuntimeError` if any argument is not callable.
+    """
+    if any(not callable(candidate) for candidate in mask_functions):
+        raise RuntimeError(f"All inputs should be callable mask_functions: {mask_functions}")
+    def and_mask(batch_idx, head_idx, q_idx, kv_idx):
+        combined = q_idx.new_ones((), dtype=torch.bool)
+        for mask_fn in mask_functions:
+            partial = mask_fn(batch_idx, head_idx, q_idx, kv_idx)
+            combined = combined & partial.to(combined.device)
+        return combined
+    return and_mask
+def or_masks(*mask_functions: Callable) -> Callable:
+    """
+    Returns a mask function that is the union of provided mask functions.
+    The combined function starts from a scalar `False` created from `q_idx` and ORs in the result of each mask
+    function, moved to the same device. Raises `RuntimeError` if any argument is not callable.
+    """
+    if any(not callable(candidate) for candidate in mask_functions):
+        raise RuntimeError(f"All inputs should be callable mask_functions: {mask_functions}")
+    def or_mask(batch_idx, head_idx, q_idx, kv_idx):
+        combined = q_idx.new_zeros((), dtype=torch.bool)
+        for mask_fn in mask_functions:
+            partial = mask_fn(batch_idx, head_idx, q_idx, kv_idx)
+            combined = combined | partial.to(combined.device)
+        return combined
+    return or_mask
+def causal_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+    """
+    Basic lower-diagonal causal mask: a key/value position is visible when it does not come after the query
+    position. `batch_idx` and `head_idx` are not used.
+    """
+    is_visible = kv_idx <= q_idx
+    return is_visible
+def bidirectional_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+    """
+    Full bidirectional mask: every key/value position is visible to any non-negative query position.
+    NOTE: It is important to keep an index-based version for non-vmap expansion.
+    """
+    is_visible = q_idx >= 0
+    return is_visible
+def sliding_window_overlay(sliding_window: int) -> Callable:
+    """
+    Builds the overlay for a sliding window pattern: a key/value position is kept when it lies strictly after
+    `q_idx - sliding_window`. Add it on top of a causal mask for a proper sliding window mask.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        window_start = q_idx - sliding_window
+        return kv_idx > window_start
+    return inner_mask
+def chunked_overlay(chunk_size: int, left_padding: torch.Tensor) -> Callable:
+    """
+    Builds the overlay for a chunked attention pattern: query and key/value positions are kept together when,
+    after subtracting the `left_padding` entry of their batch item, they floor-divide to the same chunk index.
+    Add it on top of a causal mask for a proper chunked attention mask.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        kv_chunk = (kv_idx - left_padding[batch_idx]) // chunk_size
+        q_chunk = (q_idx - left_padding[batch_idx]) // chunk_size
+        return kv_chunk == q_chunk
+    return inner_mask
+def blockwise_overlay(block_sequence_ids: torch.Tensor) -> Callable:
+    """
+    This is an overlay depicting a blockwise masking pattern. Instead of a single
+    token, each block consists of arbitrary length tokens. In causal setup, each block
+    can attend to prev block causally and can't attend to future blocks. Within one block
+    the attention is always bidirectional.
+    Mostly used in MLLMs when non-text data attends bidirectionally to itself.
+    The returned function looks up the group id of the query and key/value positions in
+    `block_sequence_ids[batch_idx]` and keeps the pair only when both ids match and are non-negative.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        # Unmask if the q and kv come from same group which is not -1 (i.e. non-text)
+        query_block = block_sequence_ids[batch_idx, q_idx]
+        key_block = block_sequence_ids[batch_idx, kv_idx]
+        same_block = query_block == key_block
+        return same_block & (query_block >= 0)
+    return inner_mask
+def sliding_window_causal_mask_function(sliding_window: int) -> Callable:
+    """
+    Returns the mask function for a sliding window mask, i.e. the intersection of the sliding window overlay
+    and the causal mask.
+    """
+    window_overlay = sliding_window_overlay(sliding_window)
+    return and_masks(window_overlay, causal_mask_function)
+def sliding_window_bidirectional_overlay(sliding_window: int) -> Callable:
+    """
+    Builds the overlay for a bidirectional sliding window pattern.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        """A token can attend to any other token if their absolute distance is within
+        the (inclusive) sliding window size (distance <= sliding_window)."""
+        distance = abs(q_idx - kv_idx)
+        return distance <= sliding_window
+    return inner_mask
+def sliding_window_bidirectional_mask_function(sliding_window: int) -> Callable:
+    """
+    Returns the mask function for a bidirectional sliding window mask, i.e. the intersection of the
+    bidirectional sliding window overlay and the bidirectional mask.
+    """
+    window_overlay = sliding_window_bidirectional_overlay(sliding_window)
+    return and_masks(window_overlay, bidirectional_mask_function)
+def chunked_causal_mask_function(chunk_size: int, left_padding: torch.Tensor) -> Callable:
+    """
+    Returns the mask function for a chunked attention mask, i.e. the intersection of the chunked overlay and
+    the causal mask.
+    """
+    chunk_overlay = chunked_overlay(chunk_size, left_padding)
+    return and_masks(chunk_overlay, causal_mask_function)
+def padding_mask_function(padding_mask: torch.Tensor) -> Callable:
+    """
+    Returns the mask function corresponding to a 2D padding mask: the value for a pair is the entry of
+    `padding_mask` at `(batch_idx, kv_idx)`; the query position is not used.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        # Note that here the mask should ALWAYS be at least of the max `kv_index` size in the dimension 1. This is because
+        # we cannot pad it here in the mask_function as we don't know the final size, and we cannot try/except, as it is not
+        # vectorizable on accelerator devices
+        keep_key = padding_mask[batch_idx, kv_idx]
+        return keep_key
+    return inner_mask
+def packed_sequence_mask_function(packed_sequence_mask: torch.Tensor) -> Callable:
+    """
+    This return the mask_function function corresponding to a 2D packed sequence mask.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        return packed_sequence_mask[batch_idx, q_idx] == packed_sequence_mask[batch_idx, kv_idx]
+    return inner_mask
+def add_offsets_to_mask_function(mask_function: Callable, q_offset: int, kv_offset: int) -> Callable:
+    """
+    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
+    not start and end indices.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        return mask_function(batch_idx, head_idx, q_idx + q_offset, kv_idx + kv_offset)
+    return inner_mask
+def prepare_padding_mask(attention_mask: torch.Tensor | None, kv_length: int, kv_offset: int) -> torch.Tensor | None:
+    """
+    From the 2D attention mask, prepare the correct padding mask to use by potentially padding it.
+    """
+    local_padding_mask = attention_mask
+    if attention_mask is not None:
+        # Pad it if necessary
+        if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
+            local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
+    return local_padding_mask
+def maybe_pad_block_sequence_ids(
+    block_sequence_ids: torch.Tensor, attention_mask: torch.Tensor | None, kv_length: int, kv_offset: int
+) -> torch.Tensor:
+    """
+    Pads the `block_sequence_ids` in case the total length is less than `kv_length`.
+    Usually that happens with `StaticCache` generation or generating without cache.
+    Pads to the right with `-1`.
+    """
+    if (padding_length := kv_length + kv_offset - block_sequence_ids.shape[-1]) > 0:
+        block_sequence_ids = F.pad(block_sequence_ids, pad=(0, padding_length), value=-1)
+    return block_sequence_ids
+def _can_skip_causal_mask_xpu(
+    padding_mask: torch.Tensor | None,
+    query_length: int,
+    kv_length: int,
+    local_attention_size: int | None,
+) -> bool:
+    """
+    XPU-specific logic for determining if we can skip causal mask creation.
+    For XPU devices, we have special handling:
+    - Single query tokens (query_length == 1) use the same logic as CUDA
+    - Multi-query tokens can skip if padding_mask is provided and correctly structured
+      The mask must have all True values in the query window and all False after
+    """
+    if is_tracing(padding_mask):
+        return False
+    # Check local attention constraint (same as CUDA)
+    if local_attention_size is not None and kv_length >= local_attention_size:
+        return False
+    if padding_mask is None:
+        # Without padding mask, can skip if single query token or full causal attention
+        return query_length == 1 or kv_length == query_length
+    # XPU allows skipping under additional conditions when padding_mask is provided
+    if query_length == 1:
+        # Single query token: skip only if no padding tokens present
+        return padding_mask.all()
+    # XPU-specific: check if query window is all True and rest is all False
+    # This allows XPU to optimize the 1st token in static cache
+    return padding_mask[:, :query_length].all() and not padding_mask[:, query_length:].any()
+def _ignore_causal_mask_sdpa(
+    padding_mask: torch.Tensor | None,
+    query_length: int,
+    kv_length: int,
+    kv_offset: int,
+    local_attention_size: int | None = None,
+) -> bool:
+    """
+    Detects whether the causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.
+    In case no token is masked in the 2D `padding_mask` argument, if `query_length == 1` or
+    `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
+    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
+    passed).
+    """
+    if padding_mask is not None and padding_mask.shape[-1] > kv_length:
+        mask_indices = torch.arange(kv_length, device=padding_mask.device)
+        mask_indices += kv_offset
+        padding_mask = padding_mask[:, mask_indices]
+    if _is_torch_xpu_available:
+        # XPU devices have special handling for mask skipping:
+        # - Single query tokens use the same logic as CUDA
+        # - Multi-query tokens can skip if padding_mask is provided and correctly structured
+        #   (all True in query window, all False after)
+        return _can_skip_causal_mask_xpu(padding_mask, query_length, kv_length, local_attention_size)
+    # When using `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
+    # hard-coded to the forward. If a user exports a model with query_length > 1, the exported model will hard-code `is_causal=True`
+    # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). Thus, we only set
+    # `ignore_causal_mask = True` if we are not tracing
+    if (
+        not is_tracing(padding_mask)
+        # only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108
+        and (query_length == 1 or kv_length == query_length)
+        # in this case we need to add special patterns to the mask so cannot be skipped otherwise
+        and (local_attention_size is None or kv_length < local_attention_size)
+        # In this case, we need to add padding to the mask, so cannot be skipped otherwise
+        and (padding_mask is None or padding_mask.all())
+    ):
+        return True
+    return False
+def _can_skip_bidirectional_mask_xpu(
+    padding_mask: torch.Tensor | None,
+    kv_length: int,
+    local_attention_size: int | None,
+) -> bool:
+    """
+    XPU-specific logic for determining if we can skip bidirectional mask creation.
+    For XPU devices, we have special handling:
+    - Skip if no padding and no local attention constraint
+    """
+    if is_tracing(padding_mask):
+        return False
+    # Check local attention constraint (same as CUDA)
+    if local_attention_size is not None and kv_length >= local_attention_size:
+        return False
+    if padding_mask is None:
+        # Without padding mask, can always skip for full bidirectional attention
+        return True
+    # Skip only if no padding tokens present
+    return padding_mask.all()
+def _ignore_bidirectional_mask_sdpa(
+    padding_mask: torch.Tensor | None,
+    kv_length: int,
+    local_attention_size: int | None = None,
+) -> bool:
+    """
+    Detects whether the bidirectional mask can be ignored in case PyTorch's SDPA is used.
+    In case no token is masked in the 2D `padding_mask` argument and no local attention constraint applies
+    (i.e. `local_attention_size` is None or `kv_length < local_attention_size`), we skip mask creation,
+    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
+    passed).
+    """
+    if _is_torch_xpu_available:
+        # XPU devices have special handling for mask skipping:
+        # - Skip if no padding and no local attention constraint
+        return _can_skip_bidirectional_mask_xpu(padding_mask, kv_length, local_attention_size)
+    # When using `torch.export` or `torch.onnx.dynamo_export`, we need to avoid to check the contents of the mask;
+    # otherwise, we will encounter dynamic control flows
+    if (
+        not is_tracing(padding_mask)
+        and (padding_mask is None or padding_mask.all())
+        # in this case we need to add special patterns to the mask so cannot be skipped otherwise
+        and (local_attention_size is None or kv_length < local_attention_size)
+    ):
+        return True
+    return False
+def _vmap_expansion_sdpa(mask_function: Callable) -> Callable:
+    """
+    Used to vmap our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
+    Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
+    functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
+    """
+    # We vmap the function over all 4 dimensions, broadcasting [b_idx, h_idx, q_idx, kv_idx]
+    dimensions = [(None, None, None, 0), (None, None, 0, None), (None, 0, None, None), (0, None, None, None)]
+    for dims in dimensions:
+        mask_function = torch.vmap(mask_function, in_dims=dims, out_dims=0)
+    return mask_function
+def _non_vmap_expansion_sdpa(
+    batch_indices: torch.Tensor, head_indices: torch.Tensor, q_indices: torch.Tensor, kv_indices: torch.Tensor
+):
+    """
+    Used to broadcast our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
+    Allows the usage of any index-based mask function without relying on vmap.
+    NOTE: This is limited to index based functions only and is not guaranteed to work otherwise.
+    Reference:
+        - https://github.com/huggingface/optimum-onnx/blob/c123e8f4fab61b54a8e0e31ce74462bcacca576e/optimum/exporters/onnx/model_patcher.py#L362-L365
+    """
+    batch_indices = batch_indices[:, None, None, None]
+    head_indices = head_indices[None, :, None, None]
+    q_indices = q_indices[None, None, :, None]
+    kv_indices = kv_indices[None, None, None, :]
+    return batch_indices, head_indices, q_indices, kv_indices
+def sdpa_mask(
+    batch_size: int,
+    q_length: int,
+    kv_length: int,
+    q_offset: int = 0,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: torch.Tensor | None = None,
+    local_size: int | None = None,
+    allow_is_causal_skip: bool = True,
+    allow_is_bidirectional_skip: bool = False,
+    allow_torch_fix: bool = True,
+    use_vmap: bool = False,
+    device: torch.device | str = "cpu",
+    **kwargs,
+) -> torch.Tensor | None:
+    """
+    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
+    the element should take part in the attention computation, and False that it should not.
+    The `use_vmap=True` path requires torch>=2.6, a `ValueError` is raised otherwise.
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        q_length (`int`):
+            The size that the query states will have during the attention computation.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        q_offset (`int`, optional):
+            An optional offset to indicate at which first position the query states will refer to.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        local_size (`int`, optional):
+            The size of the local attention, if we do not use full attention. This is only forwarded to the skip
+            checks (enabled with `allow_is_causal_skip=True` or `allow_is_bidirectional_skip=True`) to try to skip
+            mask creation if possible.
+        allow_is_causal_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
+            `torch.sdpa` instead. Default to `True`.
+        allow_is_bidirectional_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
+            i.e. full attention without any padding. Default to `False`.
+        allow_torch_fix (`bool`, optional):
+            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
+            versions. We need an arg to skip it when using eager. By default `True`.
+        use_vmap (`bool`, optional):
+            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
+            index-based (for the cost of speed performance). By default `False`.
+        device (`torch.device` or `str`, optional):
+            An optional device to create the mask on.
+        **kwargs:
+            Additional keyword arguments are accepted but not read by this function.
+    Returns:
+        `torch.Tensor` or `None`: The 4D boolean mask, or `None` when one of the enabled skip checks determined that
+        the mask does not need to be materialized.
+    Raises:
+        ValueError: If `use_vmap=True` is used with torch<2.6.
+    NOTE: with the default `allow_is_causal_skip=True`, the calls in the examples below (no padding mask,
+    `q_length == kv_length`, no `local_size`) satisfy the skip conditions and return `None` when not tracing. This
+    is why the examples pass `allow_is_causal_skip=False` to materialize the mask.
+    ## Creating a simple causal mask:
+    To create the following causal mask:
+        0 ■ ⬚ ⬚ ⬚ ⬚
+        1 ■ ■ ⬚ ⬚ ⬚
+        2 ■ ■ ■ ⬚ ⬚
+        3 ■ ■ ■ ■ ⬚
+        4 ■ ■ ■ ■ ■
+    You can do
+    ```python
+    >>> sdpa_mask(batch_size=1, q_length=5, kv_length=5, allow_is_causal_skip=False)
+    tensor([[[[ True, False, False, False, False],
+              [ True,  True, False, False, False],
+              [ True,  True,  True, False, False],
+              [ True,  True,  True,  True, False],
+              [ True,  True,  True,  True,  True]]]])
+    ```
+    ## Creating a sliding window mask:
+    To create the following sliding window mask (`sliding_window=3`):
+        0 ■ ⬚ ⬚ ⬚ ⬚
+        1 ■ ■ ⬚ ⬚ ⬚
+        2 ■ ■ ■ ⬚ ⬚
+        3 ⬚ ■ ■ ■ ⬚
+        4 ⬚ ⬚ ■ ■ ■
+    You can do
+    ```python
+    >>> sdpa_mask(batch_size=1, q_length=5, kv_length=5, mask_function=sliding_window_causal_mask_function(3), allow_is_causal_skip=False)
+    tensor([[[[ True, False, False, False, False],
+              [ True,  True, False, False, False],
+              [ True,  True,  True, False, False],
+              [False,  True,  True,  True, False],
+              [False, False,  True,  True,  True]]]])
+    ```
+    ## Creating a chunked attention mask
+    To create the following chunked attention mask (`chunk_size=3`):
+        0 ■ ⬚ ⬚ ⬚ ⬚
+        1 ■ ■ ⬚ ⬚ ⬚
+        2 ■ ■ ■ ⬚ ⬚
+        3 ⬚ ⬚ ⬚ ■ ⬚
+        4 ⬚ ⬚ ⬚ ■ ■
+    You can do
+    ```python
+    >>> sdpa_mask(batch_size=1, q_length=5, kv_length=5, mask_function=chunked_causal_mask_function(3, torch.zeros(1, dtype=int)), allow_is_causal_skip=False)
+    tensor([[[[ True, False, False, False, False],
+              [ True,  True, False, False, False],
+              [ True,  True,  True, False, False],
+              [False, False, False,  True, False],
+              [False, False, False,  True,  True]]]])
+    ```
+    """
+    # Potentially pad the 2D mask
+    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+    # Under specific conditions, we can avoid materializing the mask
+    #   1. Causal masks can rely on the `is_causal` argument
+    #   2. Bidirectional do not need any further processing (no bias)
+    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, kv_offset, local_size):
+        return None
+    if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask, kv_length, local_size):
+        return None
+    # Potentially add the padding 2D mask
+    if padding_mask is not None:
+        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+    # Index ranges for each of the 4 dimensions; a single head index is used, and the offsets are applied directly
+    # to the query and key/value indices
+    batch_arange = torch.arange(batch_size, device=device)
+    head_arange = torch.arange(1, device=device)
+    q_arange = torch.arange(q_length, device=device) + q_offset
+    kv_arange = torch.arange(kv_length, device=device) + kv_offset
+    # Actual mask creation
+    # Option 1: Fast non-vmap mask creation (default)
+    if not use_vmap:
+        # Apply mask function element-wise through broadcasting
+        attention_mask = mask_function(*_non_vmap_expansion_sdpa(batch_arange, head_arange, q_arange, kv_arange))
+        # Expand the mask to match batch size and query length if they weren't used in the mask function
+        attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
+    # Option 2: Vmap mask creation (torch>=2.6 and custom patterns)
+    elif _is_torch_greater_or_equal_than_2_6:
+        # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
+        # scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it)
+        # We don't need to add an offset to the mask_function either, as we vmap directly the correct indices for q and kv indices
+        with TransformGetItemToIndex():
+            attention_mask = _vmap_expansion_sdpa(mask_function)(batch_arange, head_arange, q_arange, kv_arange)
+    # Option 3: Error out since it indicates that the user did something custom, which they shouldn't have (torch<2.6)
+    else:
+        raise ValueError(
+            "The vmap functionality for mask creation is only supported from torch>=2.6. "
+            "Please update your torch version or use `use_vmap=False` with index-based masks."
+        )
+    # Due to a bug in versions of torch<2.5, we need to update the mask in case a query is not attending to any
+    # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
+    # The fix sets a whole row to True when every entry of that row is False
+    if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
+        attention_mask = attention_mask | torch.all(~attention_mask, dim=-1, keepdim=True)
+    return attention_mask
+def eager_mask(
+    batch_size: int,
+    q_length: int,
+    kv_length: int,
+    q_offset: int = 0,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: torch.Tensor | None = None,
+    dtype: torch.dtype = torch.float32,
+    allow_is_bidirectional_skip: bool = False,
+    use_vmap: bool = False,
+    device: torch.device | str = "cpu",
+    **kwargs,
+) -> torch.Tensor:
+    """
+    Create a 4D float mask of shape `(batch_size, 1, query_length, kv_length)` where a value of 0 indicates that
+    the element should take part in the attention computation, and -inf (minimum value for the given `dtype`) that
+    it should not.
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        q_length (`int`):
+            The size that the query states will have during the attention computation.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        q_offset (`int`, optional):
+            An optional offset to indicate at which first position the query states will refer to.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        dtype (`torch.dtype`, optional):
+            The dtype to use for the mask. By default, `torch.float32`.
+        allow_is_bidirectional_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
+            i.e. full attention without any padding. Default to `False`.
+        use_vmap (`bool`, optional):
+            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
+            index-based (for the cost of speed performance). By default `False`.
+        device (`torch.device` or `str`, optional):
+            An optional device to create the mask on.
+    """
+    # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
+    _ = kwargs.pop("allow_is_causal_skip", None)
+    _ = kwargs.pop("allow_torch_fix", None)
+    mask = sdpa_mask(
+        batch_size=batch_size,
+        q_length=q_length,
+        kv_length=kv_length,
+        q_offset=q_offset,
+        kv_offset=kv_offset,
+        mask_function=mask_function,
+        attention_mask=attention_mask,
+        allow_is_causal_skip=False,
+        allow_is_bidirectional_skip=allow_is_bidirectional_skip,
+        allow_torch_fix=False,
+        use_vmap=use_vmap,
+        device=device,
+        **kwargs,
+    )
+    # only bidirectional masks can be skipped, otherwise we convert bool -> float
+    if mask is not None:
+        min_dtype = torch.finfo(dtype).min
+        # we need 0s where the tokens should be taken into account, and -inf otherwise (mask is already of boolean type)
+        mask = torch.where(mask, torch.tensor(0.0, device=mask.device, dtype=dtype), min_dtype)
+    return mask
+def flash_attention_mask(
+    batch_size: int,
+    q_length: int,
+    kv_length: int,
+    q_offset: int = 0,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: torch.Tensor | None = None,
+    **kwargs,
+):
+    """
+    Create the attention mask necessary to use FA2. Since FA2 is un-padded by definition, here we simply return
+    `None` if the mask is fully causal, or we return the 2D mask which will then be used to extract the seq_lens.
+    We just slice it in case of sliding window.
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        q_length (`int`):
+            The size that the query states will have during the attention computation.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        q_offset (`int`, optional):
+            An optional offset to indicate at which first position the query states will refer to.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+    """
+    if attention_mask is not None:
+        # Here we need to slice from the right if using sliding or chunked (for full attention, this is equivalent to doing nothing)
+        attention_mask = attention_mask[:, -kv_length:]
+        # We only return an actual mask if there is at least 1 padding token, otherwise we return `None` and use `is_causal` in FA2
+        # (note that the attention_mask is a boolean dtype here)
+        if attention_mask.all():
+            attention_mask = None
+    return attention_mask
+def flex_attention_mask(
+    batch_size: int,
+    q_length: int,
+    kv_length: int,
+    q_offset: int = 0,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: torch.Tensor | None = None,
+    device: torch.device | str = "cpu",
+    **kwargs,
+) -> BlockMask:
+    """
+    Create a 4D block mask which is a compressed representation of the full 4D block causal mask. BlockMask is essential
+    for performant computation of flex attention. See: https://pytorch.org/blog/flexattention/
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        q_length (`int`):
+            The size that the query states will have during the attention computation.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        q_offset (`int`, optional):
+            An optional offset to indicate at which first position the query states will refer to.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        device (`torch.device` or `str`, optional):
+            An optional device to create the mask on.
+    """
+    # Potentially add the padding 2D mask
+    if attention_mask is not None:
+        # Older torch (2.5.x) cannot handle sequences not in multiples of 128 (default block size)
+        # Hence we pad to multiples of this as a minimum to ensure this
+        pad_len = ((attention_mask.shape[1] // flex_default_block_size) + 1) * flex_default_block_size
+        pad_len = pad_len - attention_mask.shape[1]
+        if not _is_torch_greater_or_equal_than_2_6 and pad_len > 0:
+            attention_mask = torch.nn.functional.pad(attention_mask, value=0, pad=(0, pad_len))
+        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+    # Add the offsets on top (because flex interface only allows length, not start and end indices)
+    mask_function = add_offsets_to_mask_function(mask_function, q_offset, kv_offset)
+    # Finally create the block mask
+    block_mask = create_block_mask(
+        mask_mod=mask_function,
+        B=batch_size,
+        H=None,
+        Q_LEN=q_length,
+        KV_LEN=kv_length,
+        device=device,
+        _compile=_is_torch_greater_or_equal_than_2_6,
+    )
+    return block_mask
+class AttentionMaskInterface(GeneralInterface):
+    """
+    Registry associating the name of an attention implementation with the function creating its attention mask.
+    """
+    # Class-level mapping (shared by all instances), so that a call to `register` can be reflected into all other
+    # files correctly, even if a new instance is created (in order to locally override a given function)
+    # Note that all the flash attention versions share the same mask function
+    _global_mapping = {
+        "sdpa": sdpa_mask,
+        "eager": eager_mask,
+        "flash_attention_2": flash_attention_mask,
+        "flash_attention_3": flash_attention_mask,
+        "flash_attention_4": flash_attention_mask,
+        "flex_attention": flex_attention_mask,
+    }
+# Global AttentionMaskInterface instance, shared by all models which do not need to overwrite any of the existing
+# mask functions
+ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
+def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor | None:
+    """
+    Find the indices of the sequence to which each new query token in the sequence belongs when using packed
+    tensor format (i.e. several sequences packed in the same batch dimension).
+    Args:
+        position_ids (`torch.Tensor`)
+            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
+    Returns:
+        A 2D tensor where each similar integer indicates that the tokens belong to the same sequence. For example, if we
+        pack 3 sequences of 2, 3 and 1 tokens respectively along a single batch dim, this will return [[0, 0, 1, 1, 1, 2]].
+        If the there is only one sequence in each batch item (and we don't compile), then we return `None` indicating
+        no packed sequences. This is the same as [[0, 0, 0, 0, 0, 0]] for the example above.
+    """
+    # What separate different sequences is when 2 consecutive positions_ids are separated by more than 1. So
+    # taking the diff (by prepending the first value - 1 to keep correct indexing) and applying cumsum to the result
+    # gives exactly the sequence indices
+    # Note that we assume that a single sequence cannot span several batch dimensions, i.e. 1 single sequence
+    # cannot be part of the end of the first batch dim and the start of the 2nd one for example
+    first_dummy_value = position_ids[:, :1] - 1  # We just need the diff on this first value to be 1
+    position_diff = torch.diff(position_ids, prepend=first_dummy_value, dim=-1)
+    packed_sequence_mask = (position_diff != 1).cumsum(-1)
+    # Sadly this is a dynamic control flow, so we cannot enable this check on anything compile related
+    if not is_tracing(packed_sequence_mask) and (packed_sequence_mask[:, -1] == 0).all():
+        return None
+    return packed_sequence_mask
+def _preprocess_mask_arguments(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | BlockMask | None,
+    past_key_values: Cache | None,
+    position_ids: torch.Tensor | None,
+    layer_idx: int | None,
+    encoder_hidden_states: torch.Tensor | None = None,
+) -> tuple[bool, torch.Tensor | BlockMask | None, int, int]:
+    """
+    Perform some common pre-processing of the mask arguments we get from the modeling code. Mostly determine the
+    key-value length and offsets, and if we should early exit or not.
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
+            batch size, query length and dtype.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
+            It can also be an already prepared 4D mask, in which case it is returned as-is.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        position_ids (`torch.Tensor`, optional)
+            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
+        layer_idx (`int`, optional):
+            If `past_key_values` is not None, this is the layer index of the cache from which to get the key-value
+            length and offset. Indeed, for hybrid caches, different layers may return different lengths.
+        encoder_hidden_states (`torch.Tensor`, optional):
+            The input embeddings of shape (batch_size, kv_length, hidden_dim). If provided, it is used instead of
+            `inputs_embeds` to infer the kv length.
+    Returns:
+        early_exit (`bool`):
+            Whether we should early exit mask creation, and return the mask as-is.
+        attention_mask (`torch.Tensor` or `BlockMask` or `None`):
+            The attention mask to either return immediately, or to use in downstream mask creation.
+        packed_sequence_mask (`torch.Tensor`, optional):
+            In case we detected packed sequence format, this is a tensor where each similar integer indicates that
+            the tokens belong to the same sequence.
+        q_length (`int`):
+            The size that the query states will have during the attention computation.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        q_offset (`int`, optional):
+            An optional offset to indicate at which first position the query states will refer to.
+        kv_offset (`int`):
+            An offset to indicate at which first position the key and values states will refer to.
+    """
+    # If the mask is already 4D, simply return as-is (it was already prepared, or it is custom)
+    if isinstance(attention_mask, (torch.Tensor, BlockMask)) and len(attention_mask.shape) == 4:
+        return True, attention_mask, None, None, None, None, None
+    # For TGI/vLLM backends, or other custom attention without equivalent mask creation: we don't need a mask!
+    # Note: it's not ideal to check the `_global_mapping` attribute instead of the object itself, however otherwise
+    # full graph dynamo tracing (i.e. torch.export or compile with `fullgraph=True`) will fail on Python<3.11
+    # with `torch._dynamo.exc.Unsupported: 'inline in skipfiles:Mapping.__contains__ | __contains__, skipped
+    # according trace_rules.lookup SKIP_DIRS'` -- can be removed when we require Python>=3.11
+    if config._attn_implementation not in ALL_MASK_ATTENTION_FUNCTIONS._global_mapping:
+        return True, None, None, None, None, None, None
+    # Move the mask to correct device, and potentially switch dtype for efficiency
+    if attention_mask is not None and attention_mask.ndim == 2:
+        attention_mask = attention_mask.to(device=inputs_embeds.device, dtype=torch.bool)
+    q_length = inputs_embeds.shape[1]
+    # If using a cache, it can give all information about mask sizes based on seen tokens
+    if past_key_values is not None:
+        q_offset = past_key_values.get_seq_length()
+        # To avoid graph breaks, StaticLayer return a tensor instead of int -> this has no impact on the ops, but we
+        # need the correct device
+        q_offset = q_offset.to(inputs_embeds.device) if isinstance(q_offset, torch.Tensor) else q_offset
+        kv_length, kv_offset = past_key_values.get_mask_sizes(q_length, layer_idx)
+    # Otherwise, we infer based on our input
+    else:
+        q_offset = 0
+        # 1. Rely on input directly
+        if attention_mask is None:
+            # For encoder-decoders, use encoder_hidden_states to infer kv_length if provided
+            kv_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else q_length
+            kv_offset = 0
+        # 2. Rely on the mask instead - needed for special cases like prefix tuning in PEFT
+        #
+        # This is a very unique and special case where an encoder utilizes a cache and expects its length
+        # to be accounted for (usually, they should never use a cache). In general, the mask should always
+        # match with the input sizes nonetheless (i.e. it does not affect others).
+        # Conclusion: "prefix tuning is evil"
+        else:
+            kv_length, kv_offset = attention_mask.shape[-1], 0
+    # We check the position_ids for potential packed sequence format (only if the 2D attention mask is explicitly None,
+    # and we don't have past_key_values, i.e. generally a training setup)
+    packed_sequence_mask = None
+    if position_ids is not None and attention_mask is None and past_key_values is None:
+        batch_size = inputs_embeds.shape[0]
+        # The position ids are sometimes just unsqueezed, without being expanded
+        if batch_size != position_ids.shape[0]:
+            position_ids = position_ids.expand(batch_size, -1)
+        packed_sequence_mask = find_packed_sequence_indices(position_ids)
+    return False, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset
+def create_causal_mask(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    past_key_values: Cache | None,
+    position_ids: torch.Tensor | None = None,
+    or_mask_function: Callable | None = None,
+    and_mask_function: Callable | None = None,
+    block_sequence_ids: torch.Tensor | None = None,
+) -> torch.Tensor | BlockMask | None:
+    """
+    Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
+    has an hybrid cache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
+    to what is needed in the `modeling_xxx.py` files).
+    If `config.is_causal` is explicitly `False`, the call is delegated to `create_bidirectional_mask`; in that case
+    `position_ids` and `block_sequence_ids` are not forwarded.
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
+            batch size, query length, dtype and device.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
+            It can also be an already prepared 4D mask, in which case it is returned as-is.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        position_ids (`torch.Tensor`, optional):
+            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the causal mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
+        block_sequence_ids (`torch.Tensor`, *optional*):
+            A tensor of same shape as input IDs indicating to which block or group each token belongs to. Tokens from
+            the same block will keep a bidirectional mask within the block, attending causally to the past. Index `-1`
+            can be used for blocks that have to keep complete causality within itself.
+    Returns:
+        The mask produced by the mask interface registered in `ALL_MASK_ATTENTION_FUNCTIONS` for
+        `config._attn_implementation`. If `_preprocess_mask_arguments` signals an early exit, the mask it returned
+        is handed back unchanged instead (the already prepared 4D `attention_mask`, or `None` when
+        `config._attn_implementation` is not a key of `ALL_MASK_ATTENTION_FUNCTIONS._global_mapping`).
+    Raises:
+        ValueError: If `or_mask_function` or `and_mask_function` is provided while
+            `_is_torch_greater_or_equal_than_2_6` is False.
+    """
+    # Power feature: if `is_causal` is False, then fallback to bi-directional mask for bi-directional attention.
+    # It allows to use decoder-only models with bi-directional attention as well
+    # Note: configs without an `is_causal` attribute are treated as causal (default `True`)
+    if not getattr(config, "is_causal", True):
+        return create_bidirectional_mask(
+            config,
+            inputs_embeds,
+            attention_mask,
+            past_key_values=past_key_values,
+            or_mask_function=or_mask_function,
+            and_mask_function=and_mask_function,
+        )
+    # If we have an hybrid cache structure, here we want to create the mask for the full layers
+    # (the first layer whose `is_sliding` flag is False); otherwise we fall back to layer 0
+    if hasattr(past_key_values, "is_sliding") and False in past_key_values.is_sliding:
+        layer_idx = past_key_values.is_sliding.index(False)
+    else:
+        layer_idx = 0
+    early_exit, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset = (
+        _preprocess_mask_arguments(config, inputs_embeds, attention_mask, past_key_values, position_ids, layer_idx)
+    )
+    # Nothing left to build: return whatever the preprocessing decided on (4D mask as-is, or None)
+    if early_exit:
+        return attention_mask
+    batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
+    mask_factory_function = causal_mask_function
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+    # Defaulting to using non-vmap based mask creations except when detecting
+    # users passing custom mask functions (as we cannot guarantee that they
+    # are properly index-based as required by our implementation).
+    use_vmap = False
+    # Do not allow skip if we are compiling (this is to match BC)
+    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
+    if _is_torch_xpu_available:
+        # Do not allow skip if we are compiling for decoding (compileable cache and a single query token), but for
+        # prefill, we still allow the skip in order to optimize the performance of the first token generation
+        allow_is_causal_skip = not (getattr(past_key_values, "is_compileable", False) and q_length == 1)
+    else:
+        allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
+    # Allow slight deviations from causal mask
+    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
+    # padding mask, etc) as the resulting mask may otherwise not be correct!
+    if or_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        # The mask is no longer purely causal, so the skip is disabled and vmap is used for the custom function
+        allow_is_causal_skip = False
+        use_vmap = True
+    if and_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_causal_skip = False
+        use_vmap = True
+    # If we detected packing format or blockwise overlay
+    if packed_sequence_mask is not None:
+        # Intersection: the packed sequence mask function is and-ed with the mask built so far
+        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
+        allow_is_causal_skip = False
+    if block_sequence_ids is not None:
+        block_sequence_ids = maybe_pad_block_sequence_ids(block_sequence_ids, attention_mask, kv_length, kv_offset)
+        # Union: the blockwise overlay is or-ed with the mask built so far
+        mask_factory_function = or_masks(mask_factory_function, blockwise_overlay(block_sequence_ids))
+        allow_is_causal_skip = False
+    # We now create the mask
+    causal_mask = mask_interface(
+        batch_size=batch_size,
+        q_length=q_length,
+        kv_length=kv_length,
+        q_offset=q_offset,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
+        device=device,
+    )
+    return causal_mask
+def create_bidirectional_mask(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    encoder_hidden_states: torch.Tensor | None = None,
+    past_key_values: Cache | None = None,
+    or_mask_function: Callable | None = None,
+    and_mask_function: Callable | None = None,
+    **kwargs,
+) -> torch.Tensor | BlockMask | None:
+    """
+    Create a standard bidirectional mask based on the attention implementation used (stored in the config).
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is only used to infer metadata
+            such as the batch size, query length, dtype, and device.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, kv_length).
+            It can also be an already prepared 4D mask of shape (batch_size, 1, query_length, kv_length),
+            in which case it is returned as-is.
+        encoder_hidden_states (`torch.Tensor`, optional):
+            The input embeddings of shape (batch_size, kv_length, hidden_dim). If provided, it is used instead of
+            `inputs_embeds` to infer the batch size, kv length, dtype and device.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the base mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the base mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top, for example for image tokens handling.
+        **kwargs:
+            Accepted but not used by this function.
+    Returns:
+        The mask produced by the mask interface registered in `ALL_MASK_ATTENTION_FUNCTIONS` for
+        `config._attn_implementation`. If `_preprocess_mask_arguments` signals an early exit, the mask it returned
+        is handed back unchanged instead.
+    Raises:
+        ValueError: If `or_mask_function` or `and_mask_function` is provided while
+            `_is_torch_greater_or_equal_than_2_6` is False.
+    """
+    # We ignore a few irrelevant arguments at the end as we do not have a (growing) cache here
+    # (`position_ids` is passed as None, `layer_idx` as 0, and the returned packed sequence mask is discarded)
+    early_exit, attention_mask, _, q_length, kv_length, q_offset, kv_offset = _preprocess_mask_arguments(
+        config, inputs_embeds, attention_mask, past_key_values, None, 0, encoder_hidden_states
+    )
+    if early_exit:
+        return attention_mask
+    # Batch size, dtype and device come from the encoder states when they are given, else from the inputs
+    embeds = encoder_hidden_states if encoder_hidden_states is not None else inputs_embeds
+    batch_size, dtype, device = embeds.shape[0], embeds.dtype, embeds.device
+    mask_factory_function = bidirectional_mask_function
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+    # Allow skipping the mask creation except we have additional masking operators (and/or masks)
+    allow_is_bidirectional_skip = True
+    # Defaulting to using non-vmap based mask creations except when detecting
+    # users passing custom mask functions (as we cannot guarantee that they
+    # are properly index-based as required by our implementation).
+    use_vmap = False
+    # Allow slight deviations from the base mask
+    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
+    # padding mask, etc) as the resulting mask may otherwise not be correct!
+    if or_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_bidirectional_skip = False
+        use_vmap = True
+    if and_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_bidirectional_skip = False
+        use_vmap = True
+    # We now create the mask
+    attention_mask = mask_interface(
+        batch_size=batch_size,
+        q_length=q_length,
+        kv_length=kv_length,
+        q_offset=q_offset,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        # Additional kwargs for sdpa (the causal skip is always disabled for a bidirectional mask)
+        allow_is_causal_skip=False,
+        allow_is_bidirectional_skip=allow_is_bidirectional_skip,
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
+        device=device,
+    )
+    return attention_mask
+def create_sliding_window_causal_mask(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    past_key_values: Cache | None,
+    position_ids: torch.Tensor | None = None,
+    or_mask_function: Callable | None = None,
+    and_mask_function: Callable | None = None,
+    block_sequence_ids: torch.Tensor | None = None,
+) -> torch.Tensor | BlockMask | None:
+    """
+    Create a sliding window causal mask based on the attention implementation used (stored in the config). This type
+    of attention pattern was mostly democratized by Mistral. If `past_key_values` has an hybrid cache structure, this
+    function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the
+    `modeling_xxx.py` files).
+    If `config.is_causal` is explicitly `False`, the call is delegated to `create_bidirectional_sliding_window_mask`;
+    in that case `position_ids` and `block_sequence_ids` are not forwarded.
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
+            batch size, query length, dtype and device.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
+            It can also be an already prepared 4D mask, in which case it is returned as-is.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        position_ids (`torch.Tensor`, optional):
+            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
+        block_sequence_ids (`torch.Tensor`, *optional*):
+            A tensor of same shape as input IDs indicating to which block or group each token belongs to. Tokens from
+            the same block will keep a bidirectional mask within the block, attending causally to the past. Index `-1`
+            can be used for blocks that have to keep complete causality within itself.
+    Returns:
+        The mask produced by the mask interface registered in `ALL_MASK_ATTENTION_FUNCTIONS` for
+        `config._attn_implementation`. If `_preprocess_mask_arguments` signals an early exit, the mask it returned
+        is handed back unchanged instead.
+    Raises:
+        ValueError: If the config has no `sliding_window` attribute or it is `None` (only checked when no early
+            exit happened), or if `or_mask_function` or `and_mask_function` is provided while
+            `_is_torch_greater_or_equal_than_2_6` is False.
+    """
+    # Power feature: if `is_causal` is False, then fallback to bi-directional mask for bi-directional attention
+    # It allows to use decoder-only models with bi-directional attention as well
+    # Note: configs without an `is_causal` attribute are treated as causal (default `True`)
+    if not getattr(config, "is_causal", True):
+        return create_bidirectional_sliding_window_mask(
+            config,
+            inputs_embeds,
+            attention_mask,
+            past_key_values=past_key_values,
+            or_mask_function=or_mask_function,
+            and_mask_function=and_mask_function,
+        )
+    # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
+    # (the first layer whose `is_sliding` flag is True); otherwise we fall back to layer 0
+    if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
+        layer_idx = past_key_values.is_sliding.index(True)
+    else:
+        layer_idx = 0
+    early_exit, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset = (
+        _preprocess_mask_arguments(config, inputs_embeds, attention_mask, past_key_values, position_ids, layer_idx)
+    )
+    # Nothing left to build: return whatever the preprocessing decided on
+    if early_exit:
+        return attention_mask
+    sliding_window = getattr(config, "sliding_window", None)
+    if sliding_window is None:
+        raise ValueError("Could not find a `sliding_window` argument in the config, or it is not set")
+    batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
+    mask_factory_function = sliding_window_causal_mask_function(sliding_window)
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+    # Defaulting to using non-vmap based mask creations except when detecting
+    # users passing custom mask functions (as we cannot guarantee that they
+    # are properly index-based as required by our implementation).
+    use_vmap = False
+    # Do not allow skip if we are compiling (this is to match BC)
+    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
+    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
+    # Allow slight deviations from causal mask
+    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
+    # padding mask, etc) as the resulting mask may otherwise not be correct!
+    if or_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_causal_skip = False
+        use_vmap = True
+    if and_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_causal_skip = False
+        use_vmap = True
+    # If we detected packing format or blockwise overlay
+    if packed_sequence_mask is not None:
+        # Intersection: the packed sequence mask function is and-ed with the mask built so far
+        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
+        allow_is_causal_skip = False
+    if block_sequence_ids is not None:
+        block_sequence_ids = maybe_pad_block_sequence_ids(block_sequence_ids, attention_mask, kv_length, kv_offset)
+        # Union: the blockwise overlay is or-ed with the mask built so far
+        mask_factory_function = or_masks(mask_factory_function, blockwise_overlay(block_sequence_ids))
+        allow_is_causal_skip = False
+    # We now create the mask
+    causal_mask = mask_interface(
+        batch_size=batch_size,
+        q_length=q_length,
+        kv_length=kv_length,
+        q_offset=q_offset,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
+        local_size=sliding_window,  # Additional kwarg for sdpa
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
+        device=device,
+    )
+    return causal_mask
+def create_bidirectional_sliding_window_mask(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    encoder_hidden_states: torch.Tensor | None = None,
+    past_key_values: Cache | None = None,
+    or_mask_function: Callable | None = None,
+    and_mask_function: Callable | None = None,
+    **kwargs,
+) -> torch.Tensor | BlockMask | None:
+    """
+    Create a standard bidirectional sliding window mask based on the attention implementation used (stored in the config).
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is only used to infer metadata
+            such as the batch size, query length, dtype, and device.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, kv_length).
+            It can also be an already prepared 4D mask of shape (batch_size, 1, query_length, kv_length),
+            in which case it is returned as-is.
+        encoder_hidden_states (`torch.Tensor`, optional):
+            The input embeddings of shape (batch_size, kv_length, hidden_dim). If provided, it is used instead of
+            `inputs_embeds` to infer the batch size, kv length, dtype and device.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the base mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the base mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top, for example for image tokens handling.
+        **kwargs:
+            Accepted but not used by this function.
+    Returns:
+        The mask produced by the mask interface registered in `ALL_MASK_ATTENTION_FUNCTIONS` for
+        `config._attn_implementation`. If `_preprocess_mask_arguments` signals an early exit, the mask it returned
+        is handed back unchanged instead.
+    Raises:
+        ValueError: If the config has no `sliding_window` attribute or it is `None` (only checked when no early
+            exit happened), or if `or_mask_function` or `and_mask_function` is provided while
+            `_is_torch_greater_or_equal_than_2_6` is False.
+    """
+    # We ignore a few irrelevant arguments at the end as we do not have a (growing) cache here
+    # (`position_ids` is passed as None, `layer_idx` as 0, and the returned packed sequence mask is discarded)
+    early_exit, attention_mask, _, q_length, kv_length, q_offset, kv_offset = _preprocess_mask_arguments(
+        config, inputs_embeds, attention_mask, past_key_values, None, 0, encoder_hidden_states
+    )
+    if early_exit:
+        return attention_mask
+    sliding_window = getattr(config, "sliding_window", None)
+    if sliding_window is None:
+        raise ValueError("Could not find a `sliding_window` argument in the config, or it is not set")
+    # Batch size, dtype and device come from the encoder states when they are given, else from the inputs
+    embeds = encoder_hidden_states if encoder_hidden_states is not None else inputs_embeds
+    batch_size, dtype, device = embeds.shape[0], embeds.dtype, embeds.device
+    mask_factory_function = sliding_window_bidirectional_mask_function(sliding_window)
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+    # vmap is only enabled below, when a custom or/and mask function is supplied
+    use_vmap = False
+    # Skipping the mask creation stays allowed unless a custom or/and mask function is supplied
+    allow_is_bidirectional_skip = True
+    # The custom overlays are applied first: union for `or_mask_function`, then intersection for `and_mask_function`
+    if or_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_bidirectional_skip = False
+        use_vmap = True
+    if and_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_bidirectional_skip = False
+        use_vmap = True
+    # We now create the mask
+    attention_mask = mask_interface(
+        batch_size=batch_size,
+        q_length=q_length,
+        kv_length=kv_length,
+        q_offset=q_offset,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        # The causal skip is always disabled for a bidirectional mask
+        allow_is_causal_skip=False,
+        allow_is_bidirectional_skip=allow_is_bidirectional_skip,
+        local_size=sliding_window,  # Additional kwarg for sdpa
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
+        device=device,
+    )
+    return attention_mask
+def create_chunked_causal_mask(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    past_key_values: Cache | None,
+    position_ids: torch.Tensor | None = None,
+    or_mask_function: Callable | None = None,
+    and_mask_function: Callable | None = None,
+) -> torch.Tensor | BlockMask | None:
+    """
+    Create a chunked attention causal mask based on the attention implementation used (stored in the config). This type
+    of attention pattern was mostly democratized by Llama4. If `past_key_values` has an hybrid cache structure, this
+    function will return the mask corresponding to one of the "chunked_attention" layers (to align to what is needed in the
+    `modeling_xxx.py` files).
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
+            batch size, query length, dtype and device.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
+            It can also be an already prepared 4D mask, in which case it is returned as-is.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        position_ids (`torch.Tensor`, optional):
+            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the chunked causal mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the chunked causal mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
+    Returns:
+        The mask produced by the mask interface registered in `ALL_MASK_ATTENTION_FUNCTIONS` for
+        `config._attn_implementation`. If `_preprocess_mask_arguments` signals an early exit, the mask it returned
+        is handed back unchanged instead.
+    Raises:
+        ValueError: If the config has no `attention_chunk_size` attribute or it is `None`; if
+            `is_flash_attention_requested(config)` holds and `kv_length + kv_offset` exceeds the chunk size; or if
+            `or_mask_function` or `and_mask_function` is provided while `_is_torch_greater_or_equal_than_2_6` is
+            False. None of these checks run when an early exit happened.
+    """
+    # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
+    # NOTE(review): the layer is looked up through the cache's `is_sliding` flags, while the docstring talks about
+    # "chunked_attention" layers -- presumably chunked layers are flagged as sliding in the cache; confirm
+    if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
+        layer_idx = past_key_values.is_sliding.index(True)
+    else:
+        layer_idx = 0
+    early_exit, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset = (
+        _preprocess_mask_arguments(config, inputs_embeds, attention_mask, past_key_values, position_ids, layer_idx)
+    )
+    # Nothing left to build: return whatever the preprocessing decided on
+    if early_exit:
+        return attention_mask
+    chunk_size = getattr(config, "attention_chunk_size", None)
+    if chunk_size is None:
+        raise ValueError("Could not find an `attention_chunk_size` argument in the config, or it is not set")
+    # Raise if using chunked attention on context too large with FA
+    if is_flash_attention_requested(config) and kv_length + kv_offset > chunk_size:
+        raise ValueError(
+            "Flash attention cannot handle chunked attention, and the key-value length is larger than the chunk size so the "
+            "chunked pattern cannot be respected. You should use another `attn_implementation` when instantiating the model"
+        )
+    batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
+    # For chunked attention and batched inputs, we need to take the number of left padding tokens into account
+    # to start the chunk from the actual start of the sequence for the padded sequence
+    if attention_mask is not None:
+        # Only count the left padding tokens, not all of them: the cumulative sum along the last dim is still 0
+        # exactly at the positions preceding the first non-zero mask entry, so summing that comparison counts them
+        left_padding_tokens = (attention_mask.cumsum(dim=-1) == torch.zeros_like(attention_mask)).sum(dim=-1)
+    else:
+        # No mask given: no padding is assumed, i.e. one zero per batch element
+        left_padding_tokens = torch.zeros(batch_size, device=device, dtype=int)
+    mask_factory_function = chunked_causal_mask_function(chunk_size, left_padding_tokens)
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+    # Defaulting to using non-vmap based mask creations except when detecting
+    # users passing custom mask functions (as we cannot guarantee that they
+    # are properly index-based as required by our implementation).
+    use_vmap = False
+    # Do not allow skip if we are compiling (this is to match BC)
+    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
+    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
+    # Allow slight deviations from causal mask
+    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
+    # padding mask, etc) as the resulting mask may otherwise not be correct!
+    if or_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_causal_skip = False
+        use_vmap = True
+    if and_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_causal_skip = False
+        use_vmap = True
+    # If we detected packing format
+    if packed_sequence_mask is not None:
+        # Intersection: the packed sequence mask function is and-ed with the mask built so far
+        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
+        allow_is_causal_skip = False
+    # We now create the mask
+    causal_mask = mask_interface(
+        batch_size=batch_size,
+        q_length=q_length,
+        kv_length=kv_length,
+        q_offset=q_offset,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
+        local_size=chunk_size,  # Additional kwarg for sdpa
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
+        device=device,
+    )
+    return causal_mask
+LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING = {
+    "full_attention": create_causal_mask,
+    "sliding_attention": create_sliding_window_causal_mask,
+    "chunked_attention": create_chunked_causal_mask,
+    "compressed_sparse_attention": create_sliding_window_causal_mask,
+    "heavily_compressed_attention": create_sliding_window_causal_mask,
+}
+def create_masks_for_generate(
+    config: PreTrainedConfig,
+    inputs_embeds: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    past_key_values: Cache | None,
+    position_ids: torch.Tensor | None = None,
+    or_mask_function: Callable | None = None,
+    and_mask_function: Callable | None = None,
+    block_sequence_ids: torch.Tensor | None = None,
+    **kwargs,
+):
+    """
+    This function mimics how we create the masks in the `modeling_xxx.py` files, and is used in places like `generate`
+    in order to easily create the masks in advance, when we compile the forwards with Static caches.
+    Args:
+        config (`PreTrainedConfig`):
+            The model config.
+        inputs_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
+            batch size, query length and dtype.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
+            It can also be an already prepared 4D mask, in which case it is returned as-is.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        position_ids (`torch.Tensor`, optional)
+            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the other mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the other mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
+        block_sequence_ids (`torch.Tensor`, *optional*):
+            A tensor of same shape as input IDs indicating to which block or group each token belongs to. Tokens from
+            the same block will keep a bidirectional mask within the block, attending causally to the past. Index `-1`
+            can be used for blocks that have to keep complete causality within itself.
+    """
+    # The attribute reside in the text config for composite models
+    effective_config = config.get_text_config()
+    # Prepare the mask args
+    mask_kwargs = {
+        "config": effective_config,
+        "inputs_embeds": inputs_embeds,
+        "attention_mask": attention_mask,
+        "past_key_values": past_key_values,
+        "position_ids": position_ids,
+        "or_mask_function": or_mask_function,
+        "and_mask_function": and_mask_function,
+        "block_sequence_ids": block_sequence_ids,
+    }
+    # If the attribute exist, we need several masks
+    if hasattr(effective_config, "layer_types"):
+        causal_masks = {}
+        for layer_pattern in set(effective_config.layer_types):
+            causal_masks[layer_pattern] = LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING[layer_pattern](**mask_kwargs)
+        return causal_masks
+    # In this case, all layers are sliding
+    elif getattr(effective_config, "sliding_window", None) is not None:
+        return create_sliding_window_causal_mask(**mask_kwargs)
+    # In this case, all layers are chunked
+    elif getattr(effective_config, "attention_chunk_size", None) is not None:
+        return create_chunked_causal_mask(**mask_kwargs)
+    # All layers use standard causal attention
+    return create_causal_mask(**mask_kwargs)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/modeling_attn_mask_utils.py ADDED Viewed

	@@ -0,0 +1,503 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMPORTANT NOTICE: Every class and function in this file is deprecated in favor of using the much more general
+`masking_utils.py` primitives. New code should not rely on it, it is only kept for backward compatibility for now,
+and will be removed in the future.
+"""
+import warnings
+from dataclasses import dataclass
+from typing import Union
+import torch
+from .utils.import_utils import is_torchdynamo_compiling, is_tracing
+# Text of the `FutureWarning` raised through `warnings.warn` by the deprecated helpers of this module.
+DEPRECATION_MESSAGE = (
+    "The attention mask API under `transformers.modeling_attn_mask_utils` (`AttentionMaskConverter`) "
+    "is deprecated and will be removed in Transformers v5.10. Please use the new API in `transformers.masking_utils`."
+)
+@dataclass
+class AttentionMaskConverter:
+    """
+    A utility attention mask class that allows one to:
+        - Create a causal 4d mask
+        - Create a causal 4d mask with a sliding window
+        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
+          key_value_length) holding `0` at attended positions and a large negative bias everywhere else
+    Examples:
+    ```python
+    >>> import torch
+    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+    >>> converter = AttentionMaskConverter(True)
+    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
+    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
+            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
+            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
+            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
+            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
+    ```
+    Parameters:
+        is_causal (`bool`):
+            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.
+        sliding_window (`int`, *optional*):
+            Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
+            A value that is not strictly positive raises a `ValueError` at construction time.
+    """
+    is_causal: bool
+    # NOTE: `None` is accepted at runtime (it is the default of `__init__`), despite the bare `int` annotation.
+    sliding_window: int
+    def __init__(self, is_causal: bool, sliding_window: int | None = None):
+        """
+        Store the configuration and emit the deprecation `FutureWarning`.
+        Raises:
+            ValueError: If `sliding_window` is given and is not strictly positive.
+        """
+        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
+        self.is_causal = is_causal
+        self.sliding_window = sliding_window
+        if self.sliding_window is not None and self.sliding_window <= 0:
+            raise ValueError(
+                f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
+            )
+    def to_causal_4d(
+        self,
+        batch_size: int,
+        query_length: int,
+        key_value_length: int,
+        dtype: torch.dtype,
+        device: Union[torch.device, "str"] = "cpu",
+    ) -> torch.Tensor | None:
+        """
+        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
+        bias to upper right hand triangular matrix (causal mask).
+        Returns `None` when `query_length` is 1 and no sliding window is configured, since no mask is built in that
+        case.
+        Raises:
+            ValueError: If the converter was created with `is_causal=False`.
+        """
+        if not self.is_causal:
+            raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.")
+        # Nothing is cached here: a fresh mask is built on every call
+        input_shape = (batch_size, query_length)
+        past_key_values_length = key_value_length - query_length
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        causal_4d_mask = None
+        if input_shape[-1] > 1 or self.sliding_window is not None:
+            causal_4d_mask = self._make_causal_mask(
+                input_shape,
+                dtype,
+                device=device,
+                past_key_values_length=past_key_values_length,
+                sliding_window=self.sliding_window,
+            )
+        return causal_4d_mask
+    def to_4d(
+        self,
+        attention_mask_2d: torch.Tensor,
+        query_length: int,
+        dtype: torch.dtype,
+        key_value_length: int | None = None,
+    ) -> torch.Tensor:
+        """
+        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
+        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
+        causal, a causal mask will be added.
+        Raises:
+            ValueError: If a causal mask has to be built and `key_value_length` is not given.
+            NotImplementedError: If a sliding window is configured on a non-causal converter.
+        """
+        input_shape = (attention_mask_2d.shape[0], query_length)
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        causal_4d_mask = None
+        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
+            if key_value_length is None:
+                raise ValueError(
+                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
+                )
+            past_key_values_length = key_value_length - query_length
+            causal_4d_mask = self._make_causal_mask(
+                input_shape,
+                dtype,
+                device=attention_mask_2d.device,
+                past_key_values_length=past_key_values_length,
+                sliding_window=self.sliding_window,
+            )
+        elif self.sliding_window is not None:
+            raise NotImplementedError("Sliding window is currently only implemented for causal masking")
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
+            attention_mask_2d.device
+        )
+        if causal_4d_mask is not None:
+            # The expanded padding mask is non-zero exactly at padded positions, so `.bool()` selects them and they
+            # are overwritten with the minimum value inside the causal mask
+            expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)
+        # expanded_attn_mask + causal_4d_mask can cause some overflow
+        expanded_4d_mask = expanded_attn_mask
+        return expanded_4d_mask
+    @staticmethod
+    def _make_causal_mask(
+        input_ids_shape: torch.Size,
+        dtype: torch.dtype,
+        device: torch.device,
+        past_key_values_length: int = 0,
+        sliding_window: int | None = None,
+    ):
+        """
+        Make the causal mask used for uni-directional self-attention.
+        The result has shape `(bsz, 1, tgt_len, tgt_len + past_key_values_length)`, with `0` at positions a query may
+        attend to and `torch.finfo(dtype).min` elsewhere. It is an `expand`ed view over a single 2D mask.
+        """
+        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
+        bsz, tgt_len = input_ids_shape
+        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+        mask_cond = torch.arange(mask.size(-1), device=device)
+        # Zero out entry [i, j] whenever j < i + 1, i.e. the lower triangle including the diagonal
+        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+        mask = mask.to(dtype)
+        if past_key_values_length > 0:
+            # Prepend all-zero columns for the past positions, leaving them unmasked
+            mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+        # add lower triangular sliding window mask if necessary
+        if sliding_window is not None:
+            # Masks the keys that lie more than `sliding_window` positions behind the (offset) query position
+            diagonal = past_key_values_length - sliding_window - 1
+            context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal)
+            # Recent changes in PyTorch prevent mutations on tensors converted with aten::_to_copy
+            # See https://github.com/pytorch/pytorch/issues/127571
+            if is_torchdynamo_compiling():
+                mask = mask.clone()
+            mask.masked_fill_(context_mask, torch.finfo(dtype).min)
+        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+    @staticmethod
+    def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
+        """
+        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+        The input is inverted on the way: entries equal to `1` become `0` and every other entry becomes
+        `torch.finfo(dtype).min`. `tgt_len` defaults to the source length when not given.
+        """
+        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
+        bsz, src_len = mask.size()
+        tgt_len = tgt_len if tgt_len is not None else src_len
+        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+        inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask
+        # Any non-zero entry of the inverted mask is a position that must not be attended
+        return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+    @staticmethod
+    def _unmask_unattended(
+        expanded_mask: torch.FloatTensor,
+        min_dtype: float,
+    ):
+        # fmt: off
+        """
+        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
+        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        Details: https://github.com/pytorch/pytorch/issues/110213
+        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
+        `min_dtype` is the value that marks a masked position in `expanded_mask`.
+        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.
+        For example, if `expanded_mask` is (e.g. here left-padding case)
+        ```
+        [[[[0, 0, 0],
+           [0, 0, 0],
+           [0, 0, 1]]],
+         [[[1, 0, 0],
+           [1, 1, 0],
+           [1, 1, 1]]],
+         [[[0, 0, 0],
+           [0, 1, 0],
+           [0, 1, 1]]]]
+        ```
+        then the modified `expanded_mask` will be
+        ```
+        [[[[1, 1, 1],   <-- modified
+           [1, 1, 1],   <-- modified
+           [0, 0, 1]]],
+         [[[1, 0, 0],
+           [1, 1, 0],
+           [1, 1, 1]]],
+         [[[1, 1, 1],   <-- modified
+           [0, 1, 0],
+           [0, 1, 1]]]]
+        ```
+        Raises:
+            ValueError: If `expanded_mask` is a boolean tensor.
+        """
+        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
+        # fmt: on
+        if expanded_mask.dtype == torch.bool:
+            raise ValueError(
+                "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
+            )
+        # Rows made only of `min_dtype` are multiplied by False (zeroed out); all other rows are multiplied by True
+        return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True))
+    @staticmethod
+    def _ignore_causal_mask_sdpa(
+        attention_mask: torch.Tensor | None,
+        inputs_embeds: torch.Tensor,
+        past_key_values_length: int,
+        sliding_window: int | None = None,
+        is_training: bool = False,
+    ) -> bool:
+        """
+        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
+        ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.
+        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
+        `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
+        allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
+        passed).
+        A 4D `attention_mask` is never ignored, and neither is any mask once `key_value_length` reaches
+        `sliding_window`.
+        """
+        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
+        _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
+        key_value_length = query_length + past_key_values_length
+        is_tracing_ = is_tracing(inputs_embeds)
+        ignore_causal_mask = False
+        if attention_mask is None:
+            # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input
+            # shape, thus SDPA's `is_causal` argument is rightfully updated
+            # (see https://gist.github.com/fxmarty/1313f39037fc1c112508989628c57363). However, when using
+            # `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
+            # hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True`
+            # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108).
+            # Thus, we only set `ignore_causal_mask = True` if the model is set to training.
+            #
+            # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal`
+            # ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor").
+            if (
+                (is_training or not is_tracing_)
+                and (query_length == 1 or key_value_length == query_length)
+                and (sliding_window is None or key_value_length < sliding_window)
+            ):
+                ignore_causal_mask = True
+        elif sliding_window is None or key_value_length < sliding_window:
+            if len(attention_mask.shape) == 4:
+                return False
+            elif not is_tracing_ and torch.all(attention_mask == 1):
+                if query_length == 1 or key_value_length == query_length:
+                    # For query_length == 1, causal attention and bi-directional attention are the same.
+                    ignore_causal_mask = True
+                # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore
+                # the attention mask, as SDPA causal mask generation may be wrong. We will set `is_causal=False` in
+                # SDPA and rely on Transformers attention_mask instead, hence not setting it to None here.
+                # Reference: https://github.com/pytorch/pytorch/issues/108108
+                # TODO: maybe revisit this with https://github.com/pytorch/pytorch/pull/114823 in PyTorch 2.3.
+        return ignore_causal_mask
+def _prepare_4d_causal_attention_mask(
+    attention_mask: torch.Tensor | None,
+    input_shape: torch.Size | tuple | list,
+    inputs_embeds: torch.Tensor,
+    past_key_values_length: int,
+    sliding_window: int | None = None,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`
+    Args:
+        attention_mask (`torch.Tensor` or `None`):
+            A 2D attention mask of shape `(batch_size, key_value_length)`
+        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
+            The input shape should be a tuple that defines `(batch_size, query_length)`.
+        inputs_embeds (`torch.Tensor`):
+            The embedded inputs as a torch Tensor.
+        past_key_values_length (`int`):
+            The length of the key value cache.
+        sliding_window (`int`, *optional*):
+            If the model uses windowed attention, a sliding window should be passed.
+    """
+    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)
+    key_value_length = input_shape[-1] + past_key_values_length
+    # 4d mask is passed through the layers
+    if attention_mask is not None and len(attention_mask.shape) == 2:
+        attention_mask = attn_mask_converter.to_4d(
+            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
+        )
+    elif attention_mask is not None and len(attention_mask.shape) == 4:
+        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
+        if tuple(attention_mask.shape) != expected_shape:
+            raise ValueError(
+                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
+            )
+        else:
+            # if the 4D mask has correct shape - invert it and fill with negative infinity
+            inverted_mask = 1.0 - attention_mask
+            attention_mask = inverted_mask.masked_fill(
+                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
+            )
+    else:
+        attention_mask = attn_mask_converter.to_causal_4d(
+            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+    return attention_mask
+# Adapted from _prepare_4d_causal_attention_mask
+def _prepare_4d_causal_attention_mask_for_sdpa(
+    attention_mask: torch.Tensor | None,
+    input_shape: torch.Size | tuple | list,
+    inputs_embeds: torch.Tensor,
+    past_key_values_length: int,
+    sliding_window: int | None = None,
+):
+    """
+    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.
+    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
+    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
+    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
+    """
+    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)
+    key_value_length = input_shape[-1] + past_key_values_length
+    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1`
+    # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing.
+    # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
+    is_tracing_ = is_tracing(inputs_embeds)
+    ignore_causal_mask = AttentionMaskConverter._ignore_causal_mask_sdpa(
+        attention_mask=attention_mask,
+        inputs_embeds=inputs_embeds,
+        past_key_values_length=past_key_values_length,
+        sliding_window=sliding_window,
+    )
+    if ignore_causal_mask:
+        expanded_4d_mask = None
+    elif attention_mask is None:
+        expanded_4d_mask = attn_mask_converter.to_causal_4d(
+            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+    else:
+        if attention_mask.dim() == 4:
+            expanded_4d_mask = attention_mask
+        else:
+            expanded_4d_mask = attn_mask_converter.to_4d(
+                attention_mask,
+                input_shape[-1],
+                dtype=inputs_embeds.dtype,
+                key_value_length=key_value_length,
+            )
+        # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when
+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        # Details: https://github.com/pytorch/pytorch/issues/110213
+        if not is_tracing_ and expanded_4d_mask.device.type in ["cuda", "xpu"]:
+            expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
+                expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
+            )
+    return expanded_4d_mask
+def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
+    """
+    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`
+    Args:
+        mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)`
+        dtype (`torch.dtype`):
+            The torch dtype the created mask shall have.
+        tgt_len (`int`):
+            The target length or query length the created mask shall have.
+    """
+    return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
+def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
+    """
+    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`
+    Args:
+        mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)`
+        dtype (`torch.dtype`):
+            The torch dtype the created mask shall have.
+        tgt_len (`int`):
+            The target length or query length the created mask shall have.
+    """
+    warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
+    _, key_value_length = mask.shape
+    tgt_len = tgt_len if tgt_len is not None else key_value_length
+    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture data-dependent controlflows.
+    if not is_tracing(mask) and torch.all(mask == 1):
+        return None
+    else:
+        return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
+def _create_4d_causal_attention_mask(
+    input_shape: torch.Size | tuple | list,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
+    sliding_window: int | None = None,
+) -> torch.Tensor | None:
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`
+    Args:
+        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
+            The input shape should be a tuple that defines `(batch_size, query_length)`.
+        dtype (`torch.dtype`):
+            The torch dtype the created mask shall have.
+        device (`int`):
+            The torch device the created mask shall have.
+        sliding_window (`int`, *optional*):
+            If the model uses windowed attention, a sliding window should be passed.
+    """
+    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)
+    key_value_length = past_key_values_length + input_shape[-1]
+    attention_mask = attn_mask_converter.to_causal_4d(
+        input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device
+    )
+    return attention_mask

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/blt/configuration_blt.py ADDED Viewed

	@@ -0,0 +1,286 @@

+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Blt model configuration"""
+from huggingface_hub.dataclasses import strict
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import auto_docstring, logging
+logger = logging.get_logger(__name__)
+@auto_docstring(checkpoint="itazap/blt-1b-hf")
+@strict
+class BltLocalEncoderConfig(PreTrainedConfig):
+    r"""
+    cross_attn_all_layers (`bool`, *optional*, defaults to `False`):
+        Whether all attention layers have cross attention.
+    cross_attn_k (`int`, *optional*, defaults to 2):
+        Number of cross-attention heads used in the model.
+    hidden_size_global (`int`, *optional*, defaults to 2048):
+        Hidden size of the global transformer layer.
+    """
+    model_type = "blt_local_encoder"
+    default_theta = 500000.0
+    vocab_size: int = 260
+    cross_attn_all_layers: bool | None = False
+    cross_attn_k: int | None = 2
+    hidden_size_global: int | None = 2048
+    hidden_size: int = 1024
+    num_attention_heads: int = 16
+    num_key_value_heads: int | None = None
+    num_hidden_layers: int = 1
+    rms_norm_eps: float = 1e-5
+    dropout: float | int | None = 0.0
+    max_position_embeddings: int = 24576
+    rope_parameters: RopeParameters | dict | None = None
+    hidden_act: str = "silu"
+    intermediate_size: int | None = None
+    initializer_range: float = 0.02
+    def __post_init__(self, **kwargs):
+        # A falsy `num_key_value_heads` (the default `None`) falls back to `num_attention_heads`
+        self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads
+        # A falsy `intermediate_size` (the default `None`) is derived as 8/3 of `hidden_size`, truncated to an int
+        self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3)
+        # Always forced to False, whatever was set before this point
+        self.tie_word_embeddings = False
+        super().__post_init__(**kwargs)
+@auto_docstring(checkpoint="itazap/blt-1b-hf")
+@strict
+class BltLocalDecoderConfig(PreTrainedConfig):
+    r"""
+    cross_attn_all_layers (`bool`, *optional*, defaults to `True`):
+        Whether all attention layers have cross attention.
+    cross_attn_k (`int`, *optional*, defaults to 2):
+        Number of cross-attention heads used in the model.
+    hidden_size_global (`int`, *optional*, defaults to 2048):
+        Hidden size of the global transformer layer.
+    """
+    model_type = "blt_local_decoder"
+    default_theta = 500000.0
+    vocab_size: int = 260
+    cross_attn_all_layers: bool | None = True
+    cross_attn_k: int | None = 2
+    hidden_size_global: int | None = 2048
+    hidden_size: int = 1024
+    num_attention_heads: int = 16
+    num_key_value_heads: int | None = None
+    num_hidden_layers: int = 9
+    rms_norm_eps: float = 1e-5
+    dropout: float | int | None = 0.0
+    max_position_embeddings: int = 24576
+    rope_parameters: RopeParameters | dict | None = None
+    hidden_act: str = "silu"
+    intermediate_size: int = 2816
+    initializer_range: float = 0.02
+    pad_token_id: int | None = None
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = None
+    tie_word_embeddings: bool = False
+    def __post_init__(self, **kwargs):
+        # A falsy `num_key_value_heads` (the default `None`) falls back to `num_attention_heads`
+        self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads
+        # Derived attribute: per-head dimension, using integer division
+        self.head_dim = self.hidden_size // self.num_attention_heads
+        # Only replaced when a falsy value was given; the field default is already non-zero
+        self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3)
+        self.tie_word_embeddings = False  # Force-set to False for BC
+        super().__post_init__(**kwargs)
+# Configuration for the model type "blt_global_transformer".
+@auto_docstring(checkpoint="itazap/blt-1b-hf")
+@strict
+class BltGlobalTransformerConfig(PreTrainedConfig):
+    model_type = "blt_global_transformer"
+    default_theta = 500000.0
+    hidden_size: int = 2048
+    num_attention_heads: int = 16
+    num_key_value_heads: int | None = None
+    num_hidden_layers: int = 25
+    rms_norm_eps: float = 1e-5
+    dropout: float | int | None = 0.0
+    max_position_embeddings: int = 4096
+    rope_parameters: RopeParameters | dict | None = None
+    hidden_act: str = "silu"
+    intermediate_size: int = 5632
+    initializer_range: float = 0.02
+    tie_word_embeddings: bool = False
+    def __post_init__(self, **kwargs):
+        # A falsy `num_key_value_heads` (the default `None`) falls back to `num_attention_heads`
+        self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads
+        # Derived attribute: per-head dimension, using integer division
+        self.head_dim = self.hidden_size // self.num_attention_heads
+        # Only replaced when a falsy value was given; the field default is already non-zero
+        self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3)
+        # Always forced to False, whatever value was passed for the field
+        self.tie_word_embeddings = False
+        super().__post_init__(**kwargs)
+# Configuration for the model type "blt_patcher".
+@auto_docstring(checkpoint="itazap/blt-1b-hf")
+@strict
+class BltPatcherConfig(PreTrainedConfig):
+    model_type = "blt_patcher"
+    vocab_size: int = 260
+    hidden_size: int = 768
+    num_hidden_layers: int = 14
+    num_attention_heads: int = 12
+    num_key_value_heads: int | None = None
+    max_position_embeddings: int = 8192
+    rms_norm_eps: float = 1e-5
+    dropout: float | int | None = 0.0
+    intermediate_size: int = 2048
+    rope_parameters: RopeParameters | dict | None = None
+    initializer_range: float = 0.02
+    tie_word_embeddings: bool = False
+    def __post_init__(self, **kwargs):
+        # A falsy `num_key_value_heads` (the default `None`) falls back to `num_attention_heads`
+        self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads
+        # Derived attribute: per-head dimension, using integer division
+        self.head_dim = self.hidden_size // self.num_attention_heads
+        # Only replaced when a falsy value was given; the field default is already non-zero
+        self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3)
+        # Always forced to False, whatever value was passed for the field
+        self.tie_word_embeddings = False
+        # NOTE: `hidden_act` is not declared as a field of this class; it is only assigned here
+        self.hidden_act = "silu"  # Blt uses silu activation
+        super().__post_init__(**kwargs)
+@auto_docstring(checkpoint="itazap/blt-1b-hf")
+@strict
+class BltConfig(PreTrainedConfig):
+    r"""
+    patch_in_forward (`bool`, *optional*, defaults to `True`):
+        Whether to perform patching during the forward pass.
+    patch_size (`int`, *optional*, defaults to 4):
+        Size of the patches used in the patching mechanism.
+    patching_mode (`str`, *optional*, defaults to `"entropy"`):
+        The mode used for patching, such as entropy-based patching.
+    patching_threshold (`float`, *optional*, defaults to 1.34):
+        Threshold value used for determining when to apply patches.
+    patching_batch_size (`int`, *optional*, defaults to 1):
+        Batch size used during the patching process.
+    max_patch_length (`int`, *optional*):
+        Maximum length of patches that can be generated.
+    cross_attn_k (`int`, *optional*, defaults to 2):
+        Number of cross-attention heads used in the model.
+    encoder_hash_byte_group_size (`list`, *optional*):
+        List of byte group sizes used in the encoder hash function.
+    encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002):
+        Vocabulary size for the encoder hash byte groups.
+    encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1):
+        Number of hash functions used in the encoder byte grouping.
+    patcher_config (`BltPatcherConfig`, *optional*):
+        Configuration for the patcher component of the model.
+    global_config (`BltGlobalTransformerConfig`, *optional*):
+        Configuration for the global transformer component of the model.
+    Example:
+    ```python
+    >>> from transformers import BltModel, BltConfig
+    >>> # Initializing a Blt configuration
+    >>> configuration = BltConfig()
+    >>> # Initializing a model from the configuration
+    >>> model = BltModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "blt"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    default_theta = 500000.0
+    # Maps each nested-config attribute name to the config class used to build it
+    # in __post_init__ below.
+    sub_configs = {
+        "patcher_config": BltPatcherConfig,
+        "encoder_config": BltLocalEncoderConfig,
+        "decoder_config": BltLocalDecoderConfig,
+        "global_config": BltGlobalTransformerConfig,
+    }
+    vocab_size: int = 260
+    max_position_embeddings: int = 4096
+    patch_in_forward: bool | None = True
+    patch_size: int | None = 4
+    patching_mode: str | None = "entropy"
+    patching_threshold: float | None = 1.335442066192627
+    patching_batch_size: int | None = 1
+    max_patch_length: int | None = None
+    cross_attn_k: int | None = 2
+    # None (or any falsy value) is replaced by [3, 4, 5, 6, 7, 8] in __post_init__.
+    encoder_hash_byte_group_size: list[int] | None = None
+    encoder_hash_byte_group_vocab: int | None = 500002
+    encoder_hash_byte_group_nb_functions: int | None = 1
+    # Each sub-config may be given as None (use defaults), a dict of kwargs, or an
+    # already-built config object (kept as-is).
+    patcher_config: dict | PreTrainedConfig | None = None
+    encoder_config: dict | PreTrainedConfig | None = None
+    decoder_config: dict | PreTrainedConfig | None = None
+    global_config: dict | PreTrainedConfig | None = None
+    tie_word_embeddings: bool = False
+    pad_token_id: int | None = None
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = None
+    initializer_range: float = 0.02
+    rope_parameters: RopeParameters | dict | None = None
+    def __post_init__(self, **kwargs):
+        # Resolve defaults, materialise the four sub-configs, and record whether the
+        # encoder's cross-attention output width differs from the global hidden size.
+        self.encoder_hash_byte_group_size = self.encoder_hash_byte_group_size or [3, 4, 5, 6, 7, 8]
+        # Initialize component configurations
+        # For every sub-config: None -> default config inheriting this config's
+        # `initializer_range`; dict -> built from the dict, with `initializer_range`
+        # filled in only if the dict does not already carry one.
+        # NOTE(review): `setdefault` mutates the dict object supplied by the caller.
+        if self.patcher_config is None:
+            self.patcher_config = BltPatcherConfig(initializer_range=self.initializer_range)
+            logger.info("patcher_config is None, using default Blt patcher config")
+        elif isinstance(self.patcher_config, dict):
+            self.patcher_config.setdefault("initializer_range", self.initializer_range)
+            self.patcher_config = BltPatcherConfig(**self.patcher_config)
+        if self.encoder_config is None:
+            self.encoder_config = BltLocalEncoderConfig(initializer_range=self.initializer_range)
+            logger.info("encoder_config is None, using default Blt encoder config")
+        elif isinstance(self.encoder_config, dict):
+            self.encoder_config.setdefault("initializer_range", self.initializer_range)
+            self.encoder_config = BltLocalEncoderConfig(**self.encoder_config)
+        if self.decoder_config is None:
+            self.decoder_config = BltLocalDecoderConfig(initializer_range=self.initializer_range)
+            logger.info("decoder_config is None, using default Blt decoder config")
+        elif isinstance(self.decoder_config, dict):
+            self.decoder_config.setdefault("initializer_range", self.initializer_range)
+            self.decoder_config = BltLocalDecoderConfig(**self.decoder_config)
+        if self.global_config is None:
+            self.global_config = BltGlobalTransformerConfig(initializer_range=self.initializer_range)
+            logger.info("global_config is None, using default Blt global config")
+        elif isinstance(self.global_config, dict):
+            self.global_config.setdefault("initializer_range", self.initializer_range)
+            self.global_config = BltGlobalTransformerConfig(**self.global_config)
+        # Determine if token embedding projection is needed based on dimension mismatch (7b)
+        # `cross_attn_k` acts as a multiplier on the encoder hidden size here. The product
+        # is stored on the global config only when it differs from the global hidden size;
+        # otherwise None is stored.
+        encoder_cross_output_size = self.encoder_config.hidden_size * self.cross_attn_k
+        self.global_config.encoder_cross_output_size = (
+            encoder_cross_output_size if encoder_cross_output_size != self.global_config.hidden_size else None
+        )
+        super().__post_init__(**kwargs)
+# Public names exported by this configuration module.
+__all__ = [
+    "BltConfig",
+    "BltPatcherConfig",
+    "BltLocalEncoderConfig",
+    "BltLocalDecoderConfig",
+    "BltGlobalTransformerConfig",
+]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/jetmoe/__init__.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+# Static type checkers see the real symbols through eager star-imports; at runtime the
+# package entry in `sys.modules` is swapped for a `_LazyModule` instead.
+if TYPE_CHECKING:
+    from .configuration_jetmoe import *
+    from .modeling_jetmoe import *
+else:
+    import sys
+    # Path of this __init__ file; used to build the import structure for the lazy module.
+    _file = globals()["__file__"]
+    # Replace this module object with a `_LazyModule` (presumably deferring submodule
+    # imports until first attribute access; confirm against `_LazyModule`).
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/jetmoe/modeling_jetmoe.py ADDED Viewed

	@@ -0,0 +1,830 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/jetmoe/modular_jetmoe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_jetmoe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2024 JetMoe AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from typing import Optional
+import torch
+from torch import nn
+from torch.nn import functional as F
+from ... import initialization as init
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import maybe_autocast, merge_with_config_defaults
+from ...utils.output_capturing import OutputRecorder, capture_outputs
+from .configuration_jetmoe import JetMoeConfig
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+@use_kernel_forward_from_hub("RMSNorm")
+class JetMoeRMSNorm(nn.Module):
+    # Root-mean-square normalisation over the last dimension with a learned per-feature scale.
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        JetMoeRMSNorm is equivalent to T5LayerNorm
+
+        Args:
+            hidden_size: Size of the learned scale vector (`weight`), initialised to ones.
+            eps: Constant added to the variance before the reciprocal square root.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Remember the caller's dtype; the statistics below are computed in float32.
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        # Mean of squares over the last dimension (no mean subtraction).
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast back to the input dtype first, then apply the learned scale.
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        # Shown inside the module's repr: weight shape and epsilon.
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class JetMoeRotaryEmbedding(nn.Module):
+    # Produces the cos/sin tables for rotary position embeddings from position ids.
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: JetMoeConfig, device=None):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_type = self.config.rope_parameters["rope_type"]
+        # "default" uses the static method below; any other type is looked up in the
+        # shared ROPE_INIT_FUNCTIONS registry.
+        rope_init_fn: Callable = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+        # Both buffers are non-persistent, i.e. excluded from the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # A separate copy of the initial frequencies is kept alongside `inv_freq`.
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: JetMoeConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        # Prefer an explicit, truthy `config.head_dim`; otherwise derive it from the hidden size.
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        # One frequency per even index in [0, dim): 1 / base ** (i / dim).
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        # `x` is only used for its device and dtype. Assumes `position_ids` is 2-D
+        # (batch, seq_len); confirm against callers.
+        # Broadcast the 1-D frequencies to (batch, n_freq, 1).
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Autocast device type falls back to "cpu" for "mps" devices or a non-string type.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            # Outer product of frequencies and positions, then swap the last two axes.
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the frequencies along the last dimension so it has 2 * n_freq entries.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class JetMoeParallelExperts(nn.Module):
+    # A bank of bias-free linear maps, one per expert, stored in a single weight tensor.
+    def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
+        """
+        Initialize the JetMoeParallelExperts module.
+        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
+        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
+        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
+        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
+        used in vllm.
+        Args:
+            num_experts (int):
+                Number of experts.
+            input_size (int):
+                Size of the input.
+            output_size (int):
+                Size of the output.
+        """
+        super().__init__()
+        # Allocated uninitialised (`torch.empty`); values must be set by weight initialisation.
+        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+    def forward(self, inputs, expert_size):
+        """
+        Forward pass of the JetMoeParallelExperts module.
+        Args:
+            inputs (Tensor):
+                Input tensor whose rows are already grouped by expert along dim 0.
+            expert_size:
+                Split sizes along dim 0, one entry per expert (passed to `Tensor.split`).
+        Returns:
+            Tensor: The per-expert linear outputs concatenated along dim 0, in expert order.
+        """
+        # Chunk i holds the rows routed to expert i.
+        input_list = inputs.split(expert_size, dim=0)
+        output_list = []
+        for i in range(self.num_experts):
+            # Bias-free linear projection with expert i's weight matrix.
+            output_list.append(F.linear(input_list[i], self.weight[i]))
+        results = torch.cat(output_list, dim=0)
+        return results
+class JetMoeTopKGating(nn.Module):
+    # Linear router that selects the top-k experts per token and builds the
+    # sort/group indices used to dispatch tokens to experts.
+    def __init__(self, input_size: int, num_experts: int, top_k: int):
+        """
+        Initialize the top-k gating mechanism.
+        Args:
+            input_size (`int`):
+                Size of the input.
+            num_experts (`int`):
+                Number of experts.
+            top_k (`int`):
+                Number of top experts to select.
+        """
+        super().__init__()
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.top_k = top_k
+        # Bias-free projection from token features to one logit per expert.
+        self.layer = nn.Linear(input_size, num_experts, bias=False)
+    def forward(self, hidden_states):
+        # Returns a 5-tuple:
+        #   index_sorted_experts: permutation of the flattened (token, k) slots, ordered by expert id
+        #   batch_index:          token row for each sorted slot (slot index // top_k)
+        #   batch_gates:          softmax gate value for each sorted slot
+        #   expert_size:          Python list with the number of slots assigned to each expert
+        #   logits:               float32 router logits for every expert
+        # compute the top_k routing decision
+        logits = self.layer(hidden_states).float()  # [batch_size x seq_len, num_experts]
+        top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)  # [num_tokens, top_k]
+        # Softmax is taken over the selected k logits only, not over all experts.
+        top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(hidden_states)  # [num_tokens, top_k]
+        # compute number of input given to each expert
+        zeros = torch.zeros(
+            [top_k_gates.size(0), self.num_experts], dtype=top_k_gates.dtype, device=top_k_gates.device
+        )  # [num_tokens, num_experts]
+        # Mark each selected (token, expert) pair with 1, then count per expert.
+        gates = zeros.scatter(1, top_k_indices, 1)  # [num_tokens, num_experts]
+        expert_size = gates.long().sum(0)  # [num_experts,]
+        # (This cause torch.compile to fail with `torch._dynamo.exc.Unsupported: Backend compiler failed with a fake tensor exception at`)
+        # (and `DataDependentOutputException`)
+        expert_size = expert_size.tolist()
+        # sort and group input tokens according to expert assignment
+        top_k_experts = top_k_indices.flatten()  # [num_tokens * top_k]
+        _, index_sorted_experts = top_k_experts.sort(0)  # [num_tokens * top_k]
+        # The flatten is row-major, so truncating division by top_k recovers the token row.
+        batch_index = index_sorted_experts.div(self.top_k, rounding_mode="trunc")  # [num_tokens * top_k]
+        # gather the gate values for grouped input tokens
+        top_k_gates = top_k_gates.flatten()  # [num_tokens * top_k]
+        batch_gates = top_k_gates[index_sorted_experts]  # [num_tokens * top_k]
+        return index_sorted_experts, batch_index, batch_gates, expert_size, logits
+class JetMoeMoE(nn.Module):
+    """
+    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+    def __init__(self, config: JetMoeConfig):
+        super().__init__()
+        self.input_size = config.hidden_size
+        self.hidden_size = config.intermediate_size
+        self.activation = ACT2FN[config.activation_function]
+        # Single output bias shared by all experts; allocated uninitialised (`torch.empty`).
+        self.bias = torch.nn.Parameter(torch.empty(self.input_size))
+        # The input projection is twice as wide so it can be split into two halves in forward.
+        self.input_linear = JetMoeParallelExperts(config.num_local_experts, self.input_size, self.hidden_size * 2)
+        self.output_linear = JetMoeParallelExperts(config.num_local_experts, self.hidden_size, self.input_size)
+        self.router = JetMoeTopKGating(
+            input_size=self.input_size,
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+        )
+    def forward(self, layer_input):
+        """
+        Forward pass of the mixture of experts layer.
+        Args:
+            layer_input (Tensor):
+                Input tensor of shape [bsz, length, emb_size].
+        Returns:
+            Tensor:
+                Output tensor of shape [bsz, length, input_size]. The router logits computed by
+                `self.router` are not returned from this method.
+        """
+        bsz, length, emb_size = layer_input.size()
+        # Flatten batch and sequence so routing operates per token.
+        layer_input = layer_input.reshape(-1, emb_size)
+        _, batch_index, batch_gates, expert_size, router_logits = self.router(layer_input)
+        # Gather token rows in expert-sorted order (a token appears once per selected expert).
+        expert_inputs = layer_input[batch_index]
+        hidden_states = self.input_linear(expert_inputs, expert_size)
+        # Gated feed-forward: activation(first half) * second half.
+        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
+        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
+        expert_outputs = self.output_linear(hidden_states, expert_size)
+        # Weight each expert output by its routing gate.
+        expert_outputs = expert_outputs * batch_gates[:, None]
+        # Sum the gated expert outputs back into their originating token rows.
+        zeros = torch.zeros((bsz * length, self.input_size), dtype=expert_outputs.dtype, device=expert_outputs.device)
+        layer_output = zeros.index_add(0, batch_index, expert_outputs)
+        layer_output = layer_output.view(bsz, length, self.input_size)
+        layer_output = layer_output + self.bias
+        return layer_output
+class JetMoeMoA(nn.Module):
+    """
+    A Sparsely gated mixture of attention layer with pairs of query- and output-projections as experts.
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+    def __init__(self, config: JetMoeConfig):
+        super().__init__()
+        self.num_experts = config.num_local_experts
+        self.input_size = config.hidden_size
+        self.hidden_size = config.kv_channels * config.num_key_value_heads
+        self.top_k = config.num_experts_per_tok
+        # Single output bias shared by all experts; allocated uninitialised (`torch.empty`).
+        self.bias = torch.nn.Parameter(torch.empty(self.input_size))
+        # `input_linear` is used by `map` (query projection), `output_linear` by `reduce`.
+        self.input_linear = JetMoeParallelExperts(self.num_experts, self.input_size, self.hidden_size)
+        self.output_linear = JetMoeParallelExperts(self.num_experts, self.hidden_size, self.input_size)
+        self.router = JetMoeTopKGating(
+            input_size=self.input_size,
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+        )
+    def map(self, layer_input):
+        """
+        Map inputs to attention experts according to routing decision and compute query projection inside each experts.
+
+        Returns a tuple `(layer_output, router_logits, topo_info)` where `layer_output` has shape
+        [bsz, length, top_k, hidden_size] and `topo_info` is the routing tuple to hand to `reduce`.
+        """
+        # Compute gating topology
+        bsz, length, emb_size = layer_input.size()
+        layer_input = layer_input.reshape(-1, emb_size)  # [bsz * length, emb_size]
+        index_sorted_experts, batch_index, batch_gates, expert_size, router_logits = self.router(layer_input)
+        topo_info = (index_sorted_experts, batch_index, batch_gates, expert_size)
+        # Group inputs according to topology and compute query projection
+        expert_inputs = layer_input[batch_index]  # [bsz * length * top_k, emb_size]
+        expert_outputs = self.input_linear(expert_inputs, expert_size)  # [bsz * length * top_k, hidden_size]
+        # Ungroup queries back to original order
+        zeros = torch.zeros(
+            (bsz * length * self.top_k, self.hidden_size), dtype=expert_outputs.dtype, device=expert_outputs.device
+        )
+        # Scatter each expert-sorted row back to its original flattened (token, k) slot.
+        layer_output = zeros.index_add(0, index_sorted_experts, expert_outputs)
+        layer_output = layer_output.view(bsz, length, self.top_k, -1)  # [bsz, length, top_k, hidden_size]
+        return layer_output, router_logits, topo_info
+    def reduce(self, layer_input, topo_info):
+        """
+        Compute output projection inside each attention experts and merge the outputs of different experts.
+
+        `layer_input` has shape [bsz, length, k, hidden_size]; `topo_info` is the routing tuple returned
+        by `map`. Returns a tensor of shape [bsz, length, input_size].
+        """
+        bsz, length, k, hidden_size = layer_input.size()
+        layer_input = layer_input.reshape(-1, hidden_size)  # [bsz * length * k, hidden_size]
+        index_sorted_experts, batch_index, batch_gates, expert_size = topo_info
+        # Group inputs according to topology and compute output projection
+        expert_inputs = layer_input[index_sorted_experts]  # [bsz * length * top_k, hidden_size]
+        expert_outputs = self.output_linear(expert_inputs, expert_size)  # [bsz * length * top_k, emb_size]
+        # Apply gates to attention expert outputs
+        expert_outputs = expert_outputs * batch_gates[:, None]
+        # Ungroup and merge outputs to original order
+        zeros = torch.zeros((bsz * length, self.input_size), dtype=expert_outputs.dtype, device=expert_outputs.device)
+        # Rows belonging to the same token are summed together.
+        layer_output = zeros.index_add(0, batch_index, expert_outputs)
+        layer_output = layer_output.view(bsz, length, self.input_size)
+        layer_output = layer_output + self.bias
+        return layer_output
+    def forward(self, layer_input):
+        # Deliberately unusable: callers must use `map` and `reduce` instead.
+        raise NotImplementedError("This module doesn't support call and forward.")
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    # Split the last dimension at its midpoint into (x1, x2) and return (-x2, x1).
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # Insert a broadcast axis so cos/sin line up with the heads dimension of q and k.
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    # The same cos/sin tables are applied to both the query and the key.
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    # Nothing to repeat: return the input tensor itself (no copy).
+    if n_rep == 1:
+        return hidden_states
+    # Add a repeat axis right after the KV-head axis, expand it, then fold it into the head axis.
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    # Plain (non-fused) scaled dot-product attention.
+    # Reads `module.num_key_value_groups` and `module.training`; extra kwargs are ignored.
+    # Returns `(attn_output, attn_weights)`; the output has dims 1 and 2 swapped
+    # relative to `query` and is made contiguous.
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    # The mask is additive: it is summed onto the raw scores before the softmax.
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    # Softmax runs in float32, then the weights are cast back to the query dtype.
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+class JetMoeAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Queries and the output projection come from a routed mixture of experts (`JetMoeMoA`), while keys
+    and values come from a single shared linear projection that is tiled `top_k` times along the head
+    dimension.
+    """
+    def __init__(self, config: JetMoeConfig, layer_idx: int | None = None):
+        """
+        Initialize the JetMoeAttention module.
+        Args:
+            config:
+                Configuration object with model hyperparameters.
+            layer_idx:
+                Index of the layer in the model.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.is_causal = True
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.num_key_value_groups = 1  # We ignore this by setting it to 1 as we have different repeat patterns
+        self.top_k = config.num_experts_per_tok
+        self.attention_dropout = config.attention_dropout
+        self.kv_projection_size = config.kv_channels * config.num_key_value_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_heads = config.num_attention_heads
+        self.head_dim = config.kv_channels
+        self.scaling = self.head_dim**-0.5
+        self.experts = JetMoeMoA(config)
+        # One projection produces keys and values together; it is split in half in forward.
+        self.kv_proj = torch.nn.Linear(config.hidden_size, self.kv_projection_size * 2, bias=False)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_embeddings: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
+        # Returns `(attn_output, attn_weights, router_logits)`.
+        # NOTE(review): `position_embeddings` is annotated as a LongTensor but is unpacked
+        # below as a `(cos, sin)` pair, and the default of None would fail at that unpacking.
+        input_shape = hidden_states.shape[:-1]
+        # Target layout before the transpose: (*input_shape, heads, head_dim).
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # Routed query projection: [bsz, length, top_k, kv_channels * num_key_value_heads].
+        query_states, router_logits, topo_info = self.experts.map(hidden_states)
+        key_states, value_states = self.kv_proj(hidden_states).chunk(2, dim=-1)
+        # Move the heads axis to dim 1. Queries end up with top_k times as many heads as keys/values.
+        query_states = query_states.view(hidden_shape).transpose(1, 2)
+        key_states = key_states.view(hidden_shape).transpose(1, 2)
+        value_states = value_states.view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        # The cache is updated with the un-tiled keys/values (before the repeat below).
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+        # Pick the attention backend named in the config; the eager implementation is the fallback.
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        # This is different from other models where we repeat k/v heads
+        # instead of repeat interleaving them
+        key_states = key_states.repeat(1, self.top_k, 1, 1)
+        value_states = value_states.repeat(1, self.top_k, 1, 1)
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Regroup heads per selected expert so `reduce` can apply each expert's output projection.
+        attn_output = attn_output.view(*input_shape, self.top_k, -1)
+        attn_output = self.experts.reduce(attn_output, topo_info)
+        attn_output = attn_output.view(*input_shape, -1)
+        return attn_output, attn_weights, router_logits
+class JetMoeDecoderLayer(GradientCheckpointingLayer):
+    # One pre-norm decoder block: RMSNorm -> MoA self-attention -> residual add,
+    # then RMSNorm -> MoE feed-forward -> residual add.
+    def __init__(self, config: JetMoeConfig, layer_idx: int | None = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.mlp = JetMoeMoE(config)
+        # Both norms are built with JetMoeRMSNorm's default eps; no config value is passed.
+        self.input_layernorm = JetMoeRMSNorm(config.hidden_size)
+        self.post_attention_layernorm = JetMoeRMSNorm(config.hidden_size)
+        self.self_attention = JetMoeAttention(config, layer_idx)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = False,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        # Returns only the updated hidden states; the attention weights and router logits
+        # returned by the attention module are discarded here.
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        # `position_ids` and `use_cache` are not named parameters of JetMoeAttention.forward;
+        # they are absorbed by its **kwargs and passed on to the attention backend.
+        hidden_states, _, _ = self.self_attention(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+@auto_docstring
+class JetMoePreTrainedModel(PreTrainedModel):
+    # Shared base class of the JetMoe models: declares the config type, backend capability flags,
+    # the output-recording hooks and the JetMoe-specific weight initialization.
+    config: JetMoeConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["JetMoeDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False  # TopK gating fails fullgraph compilation at "expert_size = expert_size.tolist()"
+    _supports_attention_backend = True
+    # Maps an output name to the module class(es) whose forward results are recorded for it; `index` selects
+    # the position inside that module's returned tuple (`JetMoeDecoderLayer` unpacks three values from
+    # `JetMoeAttention`, so indices 1 and 2 are its second and third outputs).
+    # NOTE(review): index 4 of `JetMoeTopKGating` is presumably its logits - that class is not shown here, confirm.
+    _can_record_outputs = {
+        "router_logits": [OutputRecorder(JetMoeAttention, index=2), OutputRecorder(JetMoeTopKGating, index=4)],
+        "hidden_states": JetMoeDecoderLayer,
+        "attentions": OutputRecorder(JetMoeAttention, index=1),
+    }
+    @torch.no_grad()
+    def _init_weights(self, module):
+        """
+        Initialize `module`: run the parent initialization first, then draw `JetMoeParallelExperts` weights from
+        a normal distribution with std `config.initializer_range` and zero the bias of `JetMoeMoA`/`JetMoeMoE`.
+        """
+        super()._init_weights(module)
+        if isinstance(module, JetMoeParallelExperts):
+            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, JetMoeMoA | JetMoeMoE):
+            init.zeros_(module.bias)
+@auto_docstring
+class JetMoeModel(JetMoePreTrainedModel):
+    # Bare JetMoe decoder stack: token embedding, `config.num_hidden_layers` decoder layers, final RMS norm.
+    def __init__(self, config: JetMoeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [JetMoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = JetMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = JetMoeRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self._attn_implementation = config._attn_implementation
+        # Initialize weights and apply final processing
+        self.post_init()
+    @merge_with_config_defaults
+    @capture_outputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        # The XOR is true when both inputs are given or both are missing, i.e. whenever it is not
+        # the case that exactly one of them was supplied.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        # Create a fresh cache only when caching is requested and the caller did not pass one.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if position_ids is None:
+            # Default positions continue after whatever the cache already holds (0 without a cache).
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.config,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                position_ids=position_ids,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        # Only these two fields are set explicitly here.
+        # NOTE(review): the remaining output fields are presumably filled by `capture_outputs` - confirm.
+        return MoeModelOutputWithPast(  # only diff with Mistral is the output type, we need MoE
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+def load_balancing_loss_func(
+    gate_logits: torch.Tensor | tuple[torch.Tensor] | None,
+    num_experts: int | None = None,
+    top_k=2,
+    attention_mask: torch.Tensor | None = None,
+) -> torch.Tensor | int:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        num_experts:
+            Number of experts
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+    Returns:
+        The auxiliary loss.
+    """
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return 0
+    if isinstance(gate_logits, tuple):
+        compute_device = gate_logits[0].device
+        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+class JetMoeForCausalLM(JetMoePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = JetMoeModel(config)
+        self.vocab_size = config.vocab_size
+        self.aux_loss_coef = config.aux_loss_coef
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.tie_word_embeddings = config.tie_word_embeddings
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        output_router_logits: bool | None = False,
+        **kwargs,
+    ) -> MoeCausalLMOutputWithPast:
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_router_logits=output_router_logits,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+# Sequence-classification variant: the body is intentionally empty, all behaviour comes from the two base classes.
+class JetMoeForSequenceClassification(GenericForSequenceClassification, JetMoePreTrainedModel): ...
+__all__ = ["JetMoeForCausalLM", "JetMoeModel", "JetMoePreTrainedModel", "JetMoeForSequenceClassification"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/vitmatte/__init__.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+if TYPE_CHECKING:
+    # Static type checkers see the real symbols through eager star-imports of every submodule.
+    from .configuration_vitmatte import *
+    from .image_processing_pil_vitmatte import *
+    from .image_processing_vitmatte import *
+    from .modeling_vitmatte import *
+else:
+    import sys
+    # At runtime this package's entry in `sys.modules` is replaced by a `_LazyModule` built from the
+    # import structure derived from this file.
+    # NOTE(review): presumably this defers submodule imports until first attribute access - confirm in `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/vitmatte/image_processing_pil_vitmatte.py ADDED Viewed

	@@ -0,0 +1,159 @@

+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for ViTMatte."""
+import numpy as np
+from ...image_processing_backends import PilBackend
+from ...image_processing_utils import BatchFeature
+from ...image_transforms import PaddingMode
+from ...image_transforms import pad as np_pad
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    get_image_size,
+)
+from ...processing_utils import ImagesKwargs, Unpack
+from ...utils import TensorType, auto_docstring, is_torch_available
+# NOTE(review): this guard has an empty body and therefore does nothing; it looks like a leftover
+# placeholder for torch-only imports - confirm whether it can be dropped.
+if is_torch_available():
+    pass
+# Adapted from transformers.models.vitmatte.image_processing_vitmatte.VitMatteImageProcessorKwargs
+class VitMatteImageProcessorKwargs(ImagesKwargs, total=False):
+    r"""
+    size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
+        The width and height of the image will be padded to be divisible by this number.
+    """
+    # The only key added on top of `ImagesKwargs`; optional because the class is declared with `total=False`.
+    size_divisor: int
+@auto_docstring
+class VitMatteImageProcessorPil(PilBackend):
+    do_rescale = True
+    rescale_factor = 1 / 255
+    do_normalize = True
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    do_pad = True
+    size_divisor = 32
+    valid_kwargs = VitMatteImageProcessorKwargs
+    def __init__(self, **kwargs: Unpack[VitMatteImageProcessorKwargs]) -> None:
+        size_divisibility = kwargs.pop("size_divisibility", None)
+        if size_divisibility is not None:
+            kwargs.setdefault("size_divisor", size_divisibility)
+        super().__init__(**kwargs)
+    def pad_image(
+        self,
+        image: np.ndarray,
+        size_divisor: int = 32,
+    ) -> np.ndarray:
+        """
+        Args:
+            image (`np.ndarray`):
+                Image to pad.
+            size_divisor (`int`, *optional*, defaults to 32):
+                The width and height of the image will be padded to be divisible by this number.
+        """
+        height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
+        pad_height = 0 if height % size_divisor == 0 else size_divisor - height % size_divisor
+        pad_width = 0 if width % size_divisor == 0 else size_divisor - width % size_divisor
+        if pad_width + pad_height > 0:
+            padding = ((0, pad_height), (0, pad_width))
+            image = np_pad(
+                image,
+                padding=padding,
+                mode=PaddingMode.CONSTANT,
+                constant_values=0,
+                data_format=ChannelDimension.FIRST,
+                input_data_format=ChannelDimension.FIRST,
+            )
+        return image
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        trimaps: ImageInput,
+        **kwargs: Unpack[VitMatteImageProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        trimaps (`ImageInput`):
+            The trimaps to preprocess.
+        """
+        return super().preprocess(images, trimaps, **kwargs)
+    def _preprocess_image_like_inputs(
+        self,
+        images: ImageInput,
+        trimaps: ImageInput,
+        do_convert_rgb: bool,
+        input_data_format: ChannelDimension,
+        device: str | None = None,
+        **kwargs: Unpack[VitMatteImageProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Preprocess image-like inputs.
+        """
+        images = self._prepare_image_like_inputs(
+            images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+        )
+        trimaps = self._prepare_image_like_inputs(images=trimaps, expected_ndims=2, device=device)
+        return self._preprocess(images, trimaps, **kwargs)
+    def _preprocess(
+        self,
+        images: list[np.ndarray],
+        trimaps: list[np.ndarray],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        do_pad: bool | None,
+        size_divisor: int | None,
+        return_tensors: str | TensorType | None,
+        **kwargs,
+    ) -> BatchFeature:
+        processed_images = []
+        for image, trimap in zip(images, trimaps):
+            if do_rescale:
+                image = self.rescale(image, rescale_factor)
+                trimap = self.rescale(trimap, rescale_factor)
+            if do_normalize:
+                image = self.normalize(image, image_mean, image_std)
+            # Concatenate images and trimaps along channel dimension
+            # trimap is already (1, H, W) from _prepare_image_like_inputs with expected_ndims=2
+            if trimap.ndim == 3 and trimap.shape[0] == 1:
+                image = np.concatenate([image, trimap], axis=0)
+            else:
+                image = np.concatenate([image, np.expand_dims(trimap, axis=0)], axis=0)
+            if do_pad:
+                image = self.pad_image(image, size_divisor)
+            processed_images.append(image)
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+__all__ = ["VitMatteImageProcessorPil"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/optimization.py ADDED Viewed

	@@ -0,0 +1,1342 @@

+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for BERT model."""
+from __future__ import annotations
+import math
+import warnings
+from functools import partial
+from typing import Any
+import torch
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
+from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler
+from .trainer_utils import SchedulerType
+from .utils import logging
+logger = logging.get_logger(__name__)
+def _get_constant_lambda(_=None):
+    return 1
+def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
+    """
+    Create a schedule with a constant learning rate, using the learning rate set in optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch)
+def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs):
+    """
+    Create a schedule with a constant learning rate that decreases when a metric has stopped improving.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        kwargs (`dict`, *optional*):
+            Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau`
+            for possible parameters.
+    Return:
+        `torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule.
+    """
+    return ReduceLROnPlateau(optimizer, **kwargs)
+def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1.0, num_warmup_steps))
+    return 1.0
+def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
+    """
+    Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
+    increases linearly between 0 and the initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    lr_lambda = partial(_get_constant_schedule_with_warmup_lr_lambda, num_warmup_steps=num_warmup_steps)
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
+def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
+    """
+    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
+    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    lr_lambda = partial(
+        _get_linear_schedule_with_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def _get_cosine_schedule_with_warmup_lr_lambda(
+    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float
+):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+def get_cosine_schedule_with_warmup(
+    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
+    initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    lr_lambda = partial(
+        _get_cosine_schedule_with_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda(
+    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int
+):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+    if progress >= 1.0:
+        return 0.0
+    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
+def get_cosine_with_hard_restarts_schedule_with_warmup(
+    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
+    linearly between 0 and the initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`int`, *optional*, defaults to 1):
+            The number of hard restarts to use.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    lr_lambda = partial(
+        _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def _get_polynomial_decay_schedule_with_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    lr_end: float,
+    power: float,
+    lr_init: int,
+):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    elif current_step > num_training_steps:
+        return lr_end / lr_init  # as LambdaLR multiplies by lr_init
+    else:
+        lr_range = lr_init - lr_end
+        decay_steps = num_training_steps - num_warmup_steps
+        pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
+        decay = lr_range * pct_remaining**power + lr_end
+        return decay / lr_init  # as LambdaLR multiplies by lr_init
+def get_polynomial_decay_schedule_with_warmup(
+    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
+):
+    """
+    Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
+    optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
+    initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        lr_end (`float`, *optional*, defaults to 1e-7):
+            The end LR.
+        power (`float`, *optional*, defaults to 1.0):
+            Power factor.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
+    implementation at
+    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    lr_init = optimizer.defaults["lr"]
+    if not (lr_init > lr_end):
+        raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
+    lr_lambda = partial(
+        _get_polynomial_decay_schedule_with_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        lr_end=lr_end,
+        power=power,
+        lr_init=lr_init,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: int | None = None):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    shift = timescale - num_warmup_steps
+    decay = 1.0 / math.sqrt((current_step + shift) / timescale)
+    return decay
+def get_inverse_sqrt_schedule(
+    optimizer: Optimizer, num_warmup_steps: int, timescale: int | None = None, last_epoch: int = -1
+):
+    """
+    Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a
+    warmup period which increases lr linearly from 0 to the initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        timescale (`int`, *optional*, defaults to `num_warmup_steps`):
+            Time scale.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    # Note: this implementation is adapted from
+    # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930
+    if timescale is None:
+        timescale = num_warmup_steps or 10_000
+    lr_lambda = partial(_get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale)
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+def _get_cosine_schedule_with_warmup_lr_lambda(
+    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_rate: float = 0.0
+):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+    factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+    factor = factor * (1 - min_lr_rate) + min_lr_rate
+    return max(0, factor)
+def get_cosine_with_min_lr_schedule_with_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+    min_lr: float | None = None,
+    min_lr_rate: float | None = None,
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the
+    initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+        min_lr (`float`, *optional*):
+            The minimum learning rate to reach after the cosine schedule.
+        min_lr_rate (`float`, *optional*):
+            The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    if min_lr is not None and min_lr_rate is not None:
+        raise ValueError("Only one of min_lr or min_lr_rate should be set")
+    elif min_lr is not None:
+        min_lr_rate = min_lr / optimizer.defaults["lr"]
+    elif min_lr_rate is None:
+        raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`")
+    lr_lambda = partial(
+        _get_cosine_schedule_with_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+        min_lr_rate=min_lr_rate,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float,
+    min_lr_rate: float = 0.0,
+    warmup_lr_rate: float | None = None,
+):
+    current_step = float(current_step)
+    num_warmup_steps = float(num_warmup_steps)
+    num_training_steps = float(num_training_steps)
+    if current_step < num_warmup_steps:
+        if warmup_lr_rate is None:
+            return (current_step + 1.0) / max(1.0, num_warmup_steps)
+        else:
+            warmup_lr_rate = float(warmup_lr_rate)
+            return warmup_lr_rate + (1.0 - warmup_lr_rate) * (current_step) / (max(1, num_warmup_steps - 1))
+    progress = (current_step - num_warmup_steps + 1.0) / (max(1.0, num_training_steps - num_warmup_steps))
+    factor = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress))
+    factor = factor * (1 - min_lr_rate) + min_lr_rate
+    return max(0, factor)
+def get_cosine_with_min_lr_schedule_with_warmup_lr_rate(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+    min_lr: float | None = None,
+    min_lr_rate: float | None = None,
+    warmup_lr_rate: float | None = None,
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the
+    initial lr set in the optimizer.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+        min_lr (`float`, *optional*):
+            The minimum learning rate to reach after the cosine schedule.
+        min_lr_rate (`float`, *optional*):
+            The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set.
+        warmup_lr_rate (`float`, *optional*):
+            The minimum learning rate as a ratio of the start learning rate. If not set, `warmup_lr_rate` will be treated as float(1/num_warmup_steps).
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    if min_lr is not None and min_lr_rate is not None:
+        raise ValueError("Only one of min_lr or min_lr_rate should be set")
+    elif min_lr is not None:
+        min_lr_rate = min_lr / optimizer.defaults["lr"]
+    elif min_lr_rate is None:
+        raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`")
+    lr_lambda = partial(
+        _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+        min_lr_rate=min_lr_rate,
+        warmup_lr_rate=warmup_lr_rate,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def _get_wsd_scheduler_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_stable_steps: int,
+    num_decay_steps: int,
+    warmup_type: str,
+    decay_type: str,
+    min_lr_ratio: float,
+    num_cycles: float,
+):
+    if current_step < num_warmup_steps:
+        progress = float(current_step) / float(max(1, num_warmup_steps))
+        if warmup_type == "linear":
+            factor = progress
+        elif warmup_type == "cosine":
+            factor = 0.5 * (1.0 - math.cos(math.pi * progress))
+        elif warmup_type == "1-sqrt":
+            factor = 1.0 - math.sqrt(1.0 - progress)
+        factor = factor * (1.0 - min_lr_ratio) + min_lr_ratio
+        return max(0.0, factor)
+    if current_step < num_warmup_steps + num_stable_steps:
+        return 1.0
+    if current_step < num_warmup_steps + num_stable_steps + num_decay_steps:
+        progress = float(current_step - num_warmup_steps - num_stable_steps) / float(max(1, num_decay_steps))
+        if decay_type == "linear":
+            factor = 1.0 - progress
+        elif decay_type == "cosine":
+            factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+        elif decay_type == "1-sqrt":
+            factor = 1.0 - math.sqrt(progress)
+        factor = factor * (1.0 - min_lr_ratio) + min_lr_ratio
+        return max(0.0, factor)
+    return min_lr_ratio
+def get_wsd_schedule(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_decay_steps: int,
+    num_training_steps: int | None = None,
+    num_stable_steps: int | None = None,
+    warmup_type: str = "linear",
+    decay_type: str = "cosine",
+    min_lr_ratio: float = 0,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+):
+    """
+    Create a schedule with a learning rate that has three stages:
+    1. warmup: increase from min_lr_ratio times the initial learning rate to the initial learning rate following a warmup_type.
+    2. stable: constant learning rate.
+    3. decay: decrease from the initial learning rate to min_lr_ratio times the initial learning rate following a decay_type.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_decay_steps (`int`):
+            The number of steps for the decay phase.
+        num_training_steps (`int`, *optional*):
+            The total number of training steps. This is the sum of the warmup, stable and decay steps. If `num_stable_steps` is not provided, the stable phase will be `num_training_steps - num_warmup_steps - num_decay_steps`.
+        num_stable_steps (`int`, *optional*):
+            The number of steps for the stable phase. Please ensure that `num_warmup_steps + num_stable_steps + num_decay_steps` equals `num_training_steps`, otherwise the other steps will default to the minimum learning rate.
+        warmup_type (`str`, *optional*, defaults to "linear"):
+            The type of warmup to use. Can be 'linear', 'cosine' or '1-sqrt'.
+        decay_type (`str`, *optional*, defaults to "cosine"):
+            The type of decay to use. Can be 'linear', 'cosine' or '1-sqrt'.
+        min_lr_ratio (`float`, *optional*, defaults to 0):
+            The minimum learning rate as a ratio of the initial learning rate.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    if num_training_steps is None and num_stable_steps is None:
+        raise ValueError("Either num_training_steps or num_stable_steps must be specified.")
+    if num_training_steps is not None and num_stable_steps is not None:
+        warnings.warn("Both num_training_steps and num_stable_steps are specified. num_stable_steps will be used.")
+    if warmup_type not in ["linear", "cosine", "1-sqrt"]:
+        raise ValueError(f"Unknown warmup type: {warmup_type}, expected 'linear', 'cosine' or '1-sqrt'")
+    if decay_type not in ["linear", "cosine", "1-sqrt"]:
+        raise ValueError(f"Unknown decay type: {decay_type}, expected 'linear', 'cosine' or '1-sqrt'")
+    if num_stable_steps is None:
+        num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps
+    lr_lambda = partial(
+        _get_wsd_scheduler_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_stable_steps=num_stable_steps,
+        num_decay_steps=num_decay_steps,
+        warmup_type=warmup_type,
+        decay_type=decay_type,
+        min_lr_ratio=min_lr_ratio,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+class StreamingAverage:
+    """Rolling window average for smoothing metric values.
+    Maintains a sliding window of values and computes their average,
+    useful for smoothing noisy metric values before making learning rate decisions.
+    Args:
+        window_size (`int`):
+            The maximum number of values to keep in the rolling window.
+    """
+    def __init__(self, window_size: int) -> None:
+        self.window_size: int = window_size
+        self.values: list[float] = []
+        self.sum: float = 0.0
+    def streamavg(self, value: float) -> float:
+        """Add a value and return the current rolling average."""
+        self.values.append(value)
+        self.sum += value
+        if len(self.values) > self.window_size:
+            removed = self.values.pop(0)
+            self.sum -= removed
+        return self.sum / len(self.values)
+    def state_dict(self) -> dict[str, Any]:
+        return {
+            "window_size": self.window_size,
+            "values": self.values.copy(),
+            "sum": self.sum,
+        }
+    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
+        self.window_size = state_dict.get("window_size", self.window_size)
+        self.values = state_dict.get("values", []).copy()
+        self.sum = state_dict.get("sum", 0.0)
+class GreedyLR:
+    """Adaptive learning rate scheduler that responds to training metrics.
+    GreedyLR dynamically adjusts the learning rate based on training performance:
+    - Increases LR when metrics improve consistently (divides by factor)
+    - Decreases LR when metrics plateau (multiplies by factor)
+    This differs from traditional schedulers like cosine annealing by responding
+    to actual training dynamics rather than following a predetermined schedule.
+    The scheduler writes the new learning rates directly into `optimizer.param_groups`. Unlike `LambdaLR`-based
+    schedules, `step` must be called with the metric value to react to.
+    Reference: `GreedyLR: A Novel Adaptive Learning Rate Scheduler <https://arxiv.org/abs/2512.14527>`_
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        mode (`str`, *optional*, defaults to `"min"`):
+            One of 'min' or 'max'. In 'min' mode, LR will be reduced when the
+            metric has stopped decreasing; in 'max' mode when it has stopped increasing.
+        factor (`float`, *optional*, defaults to 0.95):
+            Factor by which the learning rate will be adjusted. LR is multiplied by
+            factor on plateau and divided by factor on improvement. Must be < 1.0.
+        patience (`int`, *optional*, defaults to 10):
+            Number of epochs with no improvement after which learning rate will be adjusted. The adjustment is
+            triggered once the streak of good (or bad) steps is strictly greater than `patience`.
+        threshold (`float`, *optional*, defaults to 1e-06):
+            Threshold for measuring the new optimum.
+        threshold_mode (`str`, *optional*, defaults to `"abs"`):
+            One of 'rel' or 'abs'. In 'rel' mode the threshold scales the best value, in 'abs' mode it is added
+            to / subtracted from it.
+        cooldown (`int`, *optional*, defaults to 0):
+            Number of epochs to wait before resuming normal operation after LR has been reduced.
+        warmup (`int`, *optional*, defaults to 0):
+            Number of epochs to wait before resuming normal operation after LR has been increased.
+        min_lr (`float` or `list[float]`, *optional*, defaults to 0.001):
+            A lower bound on the learning rate. A list must have one entry per parameter group.
+        max_lr (`float` or `list[float]`, *optional*, defaults to 1.0):
+            An upper bound on the learning rate. A list must have one entry per parameter group.
+        eps (`float`, *optional*, defaults to 1e-08):
+            Minimal decay applied to lr. Changes smaller than or equal to `eps` are not written to the optimizer.
+        verbose (`bool`, *optional*, defaults to `False`):
+            If True, prints a message to stdout for each update.
+        smooth (`bool`, *optional*, defaults to `False`):
+            If True, applies streaming average smoothing to metrics.
+        window_size (`int`, *optional*, defaults to 50):
+            The window size for the streaming average when smooth=True.
+        reset_start (`int`, *optional*, defaults to 500):
+            Number of steps to wait at min_lr before resetting to initial state. The countdown is decremented
+            inside `_reduce_lr`, i.e. once per reduction attempt that leaves every group at its minimum.
+    Raises:
+        ValueError: if `factor >= 1.0`, if a `min_lr`/`max_lr` list does not match the number of parameter groups,
+            or if `mode` / `threshold_mode` is unknown.
+        TypeError: if `optimizer` is not an `Optimizer`.
+    Example:
+        ```python
+        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+        >>> scheduler = GreedyLR(optimizer, mode="min", patience=10)
+        >>> for epoch in range(100):
+        ...     train(...)
+        ...     val_loss = validate(...)
+        ...     scheduler.step(val_loss)
+        ```
+    """
+    def __init__(
+        self,
+        optimizer: Optimizer,
+        mode: str = "min",
+        factor: float = 0.95,
+        patience: int = 10,
+        threshold: float = 1e-6,
+        threshold_mode: str = "abs",
+        cooldown: int = 0,
+        warmup: int = 0,
+        min_lr: float | list[float] = 1e-3,
+        max_lr: float | list[float] = 1.0,
+        eps: float = 1e-8,
+        verbose: bool = False,
+        smooth: bool = False,
+        window_size: int = 50,
+        reset_start: int = 500,
+    ) -> None:
+        if factor >= 1.0:
+            raise ValueError("Factor should be < 1.0.")
+        if not isinstance(optimizer, Optimizer):
+            raise TypeError(f"{type(optimizer).__name__} is not an Optimizer")
+        self.optimizer = optimizer
+        self.factor = factor
+        self.patience = patience
+        self.verbose = verbose
+        self.cooldown = cooldown
+        self.warmup = warmup
+        # Remaining steps during which the metric is ignored after a reduction / an increase.
+        self.cooldown_counter = 0
+        self.warmup_counter = 0
+        self.mode = mode
+        self.threshold = threshold
+        self.threshold_mode = threshold_mode
+        self.eps = eps
+        self.smooth = smooth
+        self.window_size = window_size
+        # `reset_start` is a live countdown; `reset_start_original` keeps the configured value to restore it.
+        self.reset_start = reset_start
+        self.reset_start_original = reset_start
+        self.last_epoch = 0
+        # Normalize the bounds to one value per parameter group.
+        if isinstance(min_lr, (list, tuple)):
+            if len(min_lr) != len(optimizer.param_groups):
+                raise ValueError(f"expected {len(optimizer.param_groups)} min_lrs, got {len(min_lr)}")
+            self.min_lrs = list(min_lr)
+        else:
+            self.min_lrs = [min_lr] * len(optimizer.param_groups)
+        if isinstance(max_lr, (list, tuple)):
+            if len(max_lr) != len(optimizer.param_groups):
+                raise ValueError(f"expected {len(optimizer.param_groups)} max_lrs, got {len(max_lr)}")
+            self.max_lrs = list(max_lr)
+        else:
+            self.max_lrs = [max_lr] * len(optimizer.param_groups)
+        # Learning rates at construction time; `_reset` restores them.
+        self._init_lrs = [group["lr"] for group in optimizer.param_groups]
+        self._last_lr = self._init_lrs.copy()
+        # Start from the worst possible value so that the first metric always counts as an improvement.
+        self.best: float = float("inf") if mode == "min" else float("-inf")
+        self.num_bad_epochs = 0
+        self.num_good_epochs = 0
+        if mode not in ("min", "max"):
+            raise ValueError(f"mode {mode} is unknown!")
+        if threshold_mode not in ("rel", "abs"):
+            raise ValueError(f"threshold mode {threshold_mode} is unknown!")
+        # Only allocated when smoothing is requested.
+        self._streaming_avg: StreamingAverage | None = None
+        if smooth:
+            self._streaming_avg = StreamingAverage(window_size)
+    def step(self, metrics: float, epoch: int | None = None) -> None:
+        """Perform a scheduler step based on the given metrics.
+        Args:
+            metrics (`float`):
+                The metric value to use for LR adjustment decisions.
+            epoch (`int`, *optional*):
+                The current epoch number. If None, uses internal counter.
+        """
+        current = float(metrics)
+        if self.smooth and self._streaming_avg is not None:
+            current = self._streaming_avg.streamavg(current)
+        if epoch is None:
+            epoch = self.last_epoch + 1
+        self.last_epoch = epoch
+        # While cooling down or warming up, the metric is ignored and both streak counters stay at zero.
+        if self.cooldown_counter > 0:
+            self.cooldown_counter -= 1
+            self.num_bad_epochs = 0
+            self.num_good_epochs = 0
+        elif self.warmup_counter > 0:
+            self.warmup_counter -= 1
+            self.num_bad_epochs = 0
+            self.num_good_epochs = 0
+        else:
+            # A step is either "good" (new best) or "bad"; each outcome resets the opposite streak.
+            if self.is_better(current, self.best):
+                self.best = current
+                self.num_bad_epochs = 0
+                self.num_good_epochs += 1
+            else:
+                self.num_bad_epochs += 1
+                self.num_good_epochs = 0
+            # The comparison is strict: the LR changes on the (patience + 1)-th consecutive good/bad step.
+            if self.num_good_epochs > self.patience:
+                self._increase_lr(epoch)
+                self.warmup_counter = self.warmup
+                self.num_good_epochs = 0
+            elif self.num_bad_epochs > self.patience:
+                self._reduce_lr(epoch)
+                # Note: this assignment also runs when `_reduce_lr` triggered `_reset`, which had zeroed the counter.
+                self.cooldown_counter = self.cooldown
+                self.num_bad_epochs = 0
+        # Cache the learning rates actually present in the optimizer after this step.
+        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
+    def is_better(self, current: float, best: float) -> bool:
+        """Return whether `current` improves on `best` by more than the configured threshold.
+        In 'min' mode lower is better, in 'max' mode higher is better. With `threshold_mode="rel"` the threshold
+        scales `best`; otherwise it is an absolute margin.
+        """
+        if self.mode == "min":
+            if self.threshold_mode == "rel":
+                return current < best * (1.0 - self.threshold)
+            else:
+                return current < best - self.threshold
+        else:
+            if self.threshold_mode == "rel":
+                return current > best * (1.0 + self.threshold)
+            else:
+                return current > best + self.threshold
+    def _reduce_lr(self, epoch: int) -> None:
+        """Multiply every group's LR by `factor`, bounded below by its `min_lr`.
+        When no group is left above its minimum after the update, the `reset_start` countdown is decremented and
+        `_reset` is called once it reaches zero.
+        """
+        all_at_min = True
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            old_lr = float(param_group["lr"])
+            new_lr = max(old_lr * self.factor, self.min_lrs[i])
+            # Skip updates that would change the LR by no more than `eps`.
+            if old_lr - new_lr > self.eps:
+                param_group["lr"] = new_lr
+                if self.verbose:
+                    print(f"Epoch {epoch}: reducing learning rate of group {i} to {new_lr:.4e}.")
+            if param_group["lr"] > self.min_lrs[i]:
+                all_at_min = False
+        if all_at_min:
+            self.reset_start -= 1
+            if self.reset_start <= 0:
+                self._reset()
+    def _increase_lr(self, epoch: int) -> None:
+        """Divide every group's LR by `factor`, bounded above by its `max_lr`, and restart the reset countdown."""
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            old_lr = float(param_group["lr"])
+            new_lr = min(old_lr / self.factor, self.max_lrs[i])
+            # Skip updates that would change the LR by no more than `eps`.
+            if new_lr - old_lr > self.eps:
+                param_group["lr"] = new_lr
+                if self.verbose:
+                    print(f"Epoch {epoch}: increasing learning rate of group {i} to {new_lr:.4e}.")
+        self.reset_start = self.reset_start_original
+    def _reset(self) -> None:
+        """Restore the initial learning rates and clear the best value, counters and smoothing window."""
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            param_group["lr"] = self._init_lrs[i]
+        self.best = float("inf") if self.mode == "min" else float("-inf")
+        self.num_bad_epochs = 0
+        self.num_good_epochs = 0
+        self.cooldown_counter = 0
+        self.warmup_counter = 0
+        self.reset_start = self.reset_start_original
+        # Start smoothing from an empty window so pre-reset metrics no longer influence decisions.
+        if self.smooth and self._streaming_avg is not None:
+            self._streaming_avg = StreamingAverage(self.window_size)
+        if self.verbose:
+            print("Scheduler reset to initial state.")
+    def get_last_lr(self) -> list[float]:
+        """Return last computed learning rate by current scheduler."""
+        return self._last_lr
+    def state_dict(self) -> dict[str, Any]:
+        """Return the state of the scheduler as a dictionary.
+        The optimizer itself is not included. The smoothing window is included only when smoothing is enabled.
+        """
+        state = {
+            "factor": self.factor,
+            "min_lrs": self.min_lrs,
+            "max_lrs": self.max_lrs,
+            "patience": self.patience,
+            "verbose": self.verbose,
+            "cooldown": self.cooldown,
+            "warmup": self.warmup,
+            "cooldown_counter": self.cooldown_counter,
+            "warmup_counter": self.warmup_counter,
+            "mode": self.mode,
+            "threshold": self.threshold,
+            "threshold_mode": self.threshold_mode,
+            "best": self.best,
+            "num_bad_epochs": self.num_bad_epochs,
+            "num_good_epochs": self.num_good_epochs,
+            "eps": self.eps,
+            "last_epoch": self.last_epoch,
+            "smooth": self.smooth,
+            "window_size": self.window_size,
+            "reset_start": self.reset_start,
+            "reset_start_original": self.reset_start_original,
+            "_last_lr": self._last_lr,
+            "_init_lrs": self._init_lrs,
+        }
+        if self.smooth and self._streaming_avg is not None:
+            state["_streaming_avg"] = self._streaming_avg.state_dict()
+        return state
+    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
+        """Load state from a dictionary.
+        Keys missing from `state_dict` leave the corresponding attribute unchanged. When `_last_lr` is present,
+        its values are also written back into the optimizer's parameter groups.
+        """
+        self.factor = state_dict.get("factor", self.factor)
+        self.min_lrs = state_dict.get("min_lrs", self.min_lrs)
+        self.max_lrs = state_dict.get("max_lrs", self.max_lrs)
+        self.patience = state_dict.get("patience", self.patience)
+        self.verbose = state_dict.get("verbose", self.verbose)
+        self.cooldown = state_dict.get("cooldown", self.cooldown)
+        self.warmup = state_dict.get("warmup", self.warmup)
+        self.cooldown_counter = state_dict.get("cooldown_counter", self.cooldown_counter)
+        self.warmup_counter = state_dict.get("warmup_counter", self.warmup_counter)
+        self.mode = state_dict.get("mode", self.mode)
+        self.threshold = state_dict.get("threshold", self.threshold)
+        self.threshold_mode = state_dict.get("threshold_mode", self.threshold_mode)
+        self.best = state_dict.get("best", self.best)
+        self.num_bad_epochs = state_dict.get("num_bad_epochs", self.num_bad_epochs)
+        self.num_good_epochs = state_dict.get("num_good_epochs", self.num_good_epochs)
+        self.eps = state_dict.get("eps", self.eps)
+        self.last_epoch = state_dict.get("last_epoch", self.last_epoch)
+        self.smooth = state_dict.get("smooth", self.smooth)
+        self.window_size = state_dict.get("window_size", self.window_size)
+        self.reset_start = state_dict.get("reset_start", self.reset_start)
+        self.reset_start_original = state_dict.get("reset_start_original", self.reset_start_original)
+        self._last_lr = state_dict.get("_last_lr", self._last_lr)
+        self._init_lrs = state_dict.get("_init_lrs", self._init_lrs)
+        # A saved smoothing window is restored even if this instance was created without smoothing.
+        if "_streaming_avg" in state_dict:
+            if self._streaming_avg is None:
+                self._streaming_avg = StreamingAverage(self.window_size)
+            self._streaming_avg.load_state_dict(state_dict["_streaming_avg"])
+        # Push the restored learning rates into the optimizer so training resumes where it stopped.
+        if "_last_lr" in state_dict:
+            for param_group, lr in zip(self.optimizer.param_groups, self._last_lr):
+                param_group["lr"] = lr
+def get_greedy_schedule(optimizer: Optimizer, **kwargs):
+    """
+    Create an adaptive learning rate scheduler that adjusts LR based on training metrics.
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        kwargs (`dict`, *optional*):
+            Extra parameters passed to the scheduler. See [`GreedyLR`] for possible parameters.
+    Return:
+        [`GreedyLR`] with the appropriate schedule.
+    """
+    return GreedyLR(optimizer, **kwargs)
+# Dispatch table from each `SchedulerType` member to the factory function that builds the matching scheduler.
+# `get_scheduler` looks the requested name up here before deciding which arguments to forward.
+TYPE_TO_SCHEDULER_FUNCTION = {
+    SchedulerType.LINEAR: get_linear_schedule_with_warmup,
+    SchedulerType.COSINE: get_cosine_schedule_with_warmup,
+    SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
+    SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
+    SchedulerType.CONSTANT: get_constant_schedule,
+    SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
+    SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
+    SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
+    SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup,
+    SchedulerType.COSINE_WARMUP_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup_lr_rate,
+    SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule,
+    SchedulerType.GREEDY: get_greedy_schedule,
+}
+def get_scheduler(
+    name: str | SchedulerType,
+    optimizer: Optimizer,
+    num_warmup_steps: int | None = None,
+    num_training_steps: int | None = None,
+    scheduler_specific_kwargs: dict | None = None,
+):
+    """
+    Unified API to get any scheduler from its name.
+    Args:
+        name (`str` or `SchedulerType`):
+            The name of the scheduler to use.
+        optimizer (`torch.optim.Optimizer`):
+            The optimizer that will be used during training.
+        num_warmup_steps (`int`, *optional*):
+            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        num_training_steps (`int``, *optional*):
+            The number of training steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        scheduler_specific_kwargs (`dict`, *optional*):
+            Extra parameters for schedulers such as cosine with restarts. Mismatched scheduler types and scheduler
+            parameters will cause the scheduler function to raise a TypeError.
+    """
+    name = SchedulerType(name)
+    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+    # If a `LayerWiseDummyOptimizer` is passed we extract the optimizer dict and
+    # recursively call `get_scheduler` to get the proper schedulers on each parameter
+    if optimizer is not None and isinstance(optimizer, LayerWiseDummyOptimizer):
+        optimizer_dict = optimizer.optimizer_dict
+        scheduler_dict = {}
+        for param in optimizer_dict:
+            scheduler_dict[param] = get_scheduler(
+                name,
+                optimizer=optimizer_dict[param],
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                scheduler_specific_kwargs=scheduler_specific_kwargs,
+            )
+        def scheduler_hook(param):
+            # Since the optimizer hook has been already attached we only need to
+            # attach the scheduler hook, the gradients have been zeroed here
+            scheduler_dict[param].step()
+        for param in optimizer_dict:
+            if param.requires_grad:
+                param.register_post_accumulate_grad_hook(scheduler_hook)
+        return LayerWiseDummyScheduler(optimizer_dict=optimizer_dict, lr=optimizer.defaults["lr"])
+    if name == SchedulerType.CONSTANT:
+        return schedule_func(optimizer)
+    if scheduler_specific_kwargs is None:
+        scheduler_specific_kwargs = {}
+    if name == SchedulerType.REDUCE_ON_PLATEAU:
+        return schedule_func(optimizer, **scheduler_specific_kwargs)
+    if name == SchedulerType.GREEDY:
+        return schedule_func(optimizer, **scheduler_specific_kwargs)
+    # All other schedulers require `num_warmup_steps`
+    if num_warmup_steps is None:
+        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+    if name == SchedulerType.CONSTANT_WITH_WARMUP:
+        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
+    if name == SchedulerType.INVERSE_SQRT:
+        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **scheduler_specific_kwargs)
+    # wsd scheduler requires either num_training_steps or num_stable_steps
+    if name == SchedulerType.WARMUP_STABLE_DECAY:
+        return schedule_func(
+            optimizer,
+            num_warmup_steps=num_warmup_steps,
+            num_training_steps=num_training_steps,
+            **scheduler_specific_kwargs,
+        )
+    # All other schedulers require `num_training_steps`
+    if num_training_steps is None:
+        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+    return schedule_func(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        **scheduler_specific_kwargs,
+    )
+class Adafactor(Optimizer):
+    """
+    AdaFactor PyTorch implementation that can be used as a drop-in replacement for Adam. Adapted from the original
+    fairseq code: https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
+    Paper: *Adafactor: Adaptive Learning Rates with Sublinear Memory Cost* https://huggingface.co/papers/1804.04235 Note that
+    this optimizer internally adjusts the learning rate depending on the `scale_parameter`, `relative_step` and
+    `warmup_init` options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
+    `relative_step=False`.
+    Arguments:
+        params (`Iterable[nn.parameter.Parameter]`):
+            Iterable of parameters to optimize or dictionaries defining parameter groups.
+        lr (`float`, *optional*):
+            The external learning rate. Must be provided when `relative_step=False` and must be `None` when
+            `relative_step=True`.
+        eps (`tuple[float, float]`, *optional*, defaults to `(1e-30, 0.001)`):
+            Regularization constants for square gradient and parameter scale respectively
+        clip_threshold (`float`, *optional*, defaults to 1.0):
+            Threshold of root mean square of final gradient update
+        decay_rate (`float`, *optional*, defaults to -0.8):
+            Exponent used to compute the decay of the running average of the squared gradient
+            (`beta2_t = 1 - step ** decay_rate`)
+        beta1 (`float`, *optional*):
+            Coefficient used for computing running averages of gradient. When `None`, no first moment is kept.
+        weight_decay (`float`, *optional*, defaults to 0.0):
+            Weight decay (L2 penalty)
+        scale_parameter (`bool`, *optional*, defaults to `True`):
+            If True, learning rate is scaled by root mean square
+        relative_step (`bool`, *optional*, defaults to `True`):
+            If True, time-dependent learning rate is computed instead of external learning rate
+        warmup_init (`bool`, *optional*, defaults to `False`):
+            Time-dependent learning rate computation depends on whether warm-up initialization is being used
+    Raises:
+        ValueError: If a manual `lr` is combined with `relative_step=True`, or if `warmup_init=True` is used with
+            `relative_step=False`.
+    This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested.
+    Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3):
+        - Training without LR warmup or clip_threshold is not recommended.
+           - use scheduled LR warm-up to fixed LR
+           - use clip_threshold=1.0 (https://huggingface.co/papers/1804.04235)
+        - Disable relative updates
+        - Use scale_parameter=False
+        - Additional optimizer operations like gradient clipping should not be used alongside Adafactor
+    Example:
+    ```python
+    Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
+    ```
+    Others reported the following combination to work well:
+    ```python
+    Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+    ```
+    When using `lr=None` with [`Trainer`] you will most likely need to use [`~optimization.AdafactorSchedule`]
+    scheduler as following:
+    ```python
+    from transformers.optimization import Adafactor, AdafactorSchedule
+    optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+    lr_scheduler = AdafactorSchedule(optimizer)
+    trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
+    ```
+    Usage:
+    ```python
+    # replace AdamW with Adafactor
+    optimizer = Adafactor(
+        model.parameters(),
+        lr=1e-3,
+        eps=(1e-30, 1e-3),
+        clip_threshold=1.0,
+        decay_rate=-0.8,
+        beta1=None,
+        weight_decay=0.0,
+        relative_step=False,
+        scale_parameter=False,
+        warmup_init=False,
+    )
+    ```"""
+    def __init__(
+        self,
+        params,
+        lr=None,
+        eps=(1e-30, 1e-3),
+        clip_threshold=1.0,
+        decay_rate=-0.8,
+        beta1=None,
+        weight_decay=0.0,
+        scale_parameter=True,
+        relative_step=True,
+        warmup_init=False,
+    ):
+        if lr is not None and relative_step:
+            raise ValueError("Cannot combine manual `lr` and `relative_step=True` options")
+        if warmup_init and not relative_step:
+            raise ValueError("`warmup_init=True` requires `relative_step=True`")
+        defaults = {
+            "lr": lr,
+            "eps": eps,
+            "clip_threshold": clip_threshold,
+            "decay_rate": decay_rate,
+            "beta1": beta1,
+            "weight_decay": weight_decay,
+            "scale_parameter": scale_parameter,
+            "relative_step": relative_step,
+            "warmup_init": warmup_init,
+        }
+        super().__init__(params, defaults)
+    @staticmethod
+    def _get_lr(param_group, param_state):
+        """
+        Compute the effective learning rate for one parameter.
+        With `relative_step`, the step size is derived from `param_state["step"]`; otherwise the group's external `lr`
+        is used. With `scale_parameter`, the result is multiplied by `max(eps[1], param_state["RMS"])`.
+        Raises:
+            ValueError: If `relative_step` is disabled and the group has no `lr`.
+        """
+        rel_step_sz = param_group["lr"]
+        if param_group["relative_step"]:
+            min_step = 1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2
+            rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"]))
+        elif rel_step_sz is None:
+            # Without this check the multiplication below fails with an opaque `float * NoneType` TypeError.
+            raise ValueError("`lr` must be provided when `relative_step=False`")
+        param_scale = 1.0
+        if param_group["scale_parameter"]:
+            param_scale = max(param_group["eps"][1], param_state["RMS"])
+        return param_scale * rel_step_sz
+    @staticmethod
+    def _get_options(param_group, param_shape):
+        """Return `(factored, use_first_moment)`: factor when the shape has at least 2 dims; keep a first moment when `beta1` is set."""
+        factored = len(param_shape) >= 2
+        use_first_moment = param_group["beta1"] is not None
+        return factored, use_first_moment
+    @staticmethod
+    def _rms(tensor):
+        """Root mean square of all elements of `tensor`."""
+        return tensor.norm(2) / (tensor.numel() ** 0.5)
+    @staticmethod
+    def _approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col):
+        """Combine the row and column statistics into the factored inverse-square-root scaling tensor."""
+        # copy from fairseq's adafactor implementation:
+        # https://github.com/huggingface/transformers/blob/8395f14de6068012787d83989c3627c3df6a252b/src/transformers/optimization.py#L505
+        r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1)
+        c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
+        return torch.mul(r_factor, c_factor)
+    @torch.no_grad()
+    def step(self, closure=None):
+        """
+        Performs a single optimization step
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The value returned by `closure`, or `None` when no closure is given.
+        Raises:
+            RuntimeError: If a parameter has a sparse gradient.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad
+                # Do the arithmetic in fp32 when the gradient is half precision.
+                if grad.dtype in {torch.float16, torch.bfloat16}:
+                    grad = grad.float()
+                if grad.is_sparse:
+                    raise RuntimeError("Adafactor does not support sparse gradients.")
+                state = self.state[p]
+                grad_shape = grad.shape
+                factored, use_first_moment = self._get_options(group, grad_shape)
+                # State Initialization
+                if len(state) == 0:
+                    state["step"] = 0
+                    if use_first_moment:
+                        # Exponential moving average of gradient values
+                        state["exp_avg"] = torch.zeros_like(grad)
+                    if factored:
+                        # Allocate directly with the gradient's dtype/device instead of creating a CPU tensor
+                        # and copying it over.
+                        state["exp_avg_sq_row"] = grad.new_zeros(grad_shape[:-1])
+                        state["exp_avg_sq_col"] = grad.new_zeros(grad_shape[:-2] + grad_shape[-1:])
+                    else:
+                        state["exp_avg_sq"] = torch.zeros_like(grad)
+                    state["RMS"] = 0
+                else:
+                    # Keep existing state on the same dtype/device as the (possibly up-cast) gradient.
+                    if use_first_moment:
+                        state["exp_avg"] = state["exp_avg"].to(grad)
+                    if factored:
+                        state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
+                        state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
+                    else:
+                        state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)
+                # For half-precision parameters work on an fp32 copy and write it back at the end.
+                p_data_fp32 = p
+                if p.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()
+                state["step"] += 1
+                state["RMS"] = self._rms(p_data_fp32)
+                lr = self._get_lr(group, state)
+                beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
+                update = (grad**2) + group["eps"][0]
+                if factored:
+                    exp_avg_sq_row = state["exp_avg_sq_row"]
+                    exp_avg_sq_col = state["exp_avg_sq_col"]
+                    exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t))
+                    exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t))
+                    # Approximation of exponential moving average of square of gradient
+                    update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
+                    update.mul_(grad)
+                else:
+                    exp_avg_sq = state["exp_avg_sq"]
+                    exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t))
+                    update = exp_avg_sq.rsqrt().mul_(grad)
+                # Clip the update so that its RMS does not exceed `clip_threshold`.
+                update.div_((self._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
+                update.mul_(lr)
+                if use_first_moment:
+                    exp_avg = state["exp_avg"]
+                    exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"]))
+                    update = exp_avg
+                if group["weight_decay"] != 0:
+                    p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr))
+                p_data_fp32.add_(-update)
+                if p.dtype in {torch.float16, torch.bfloat16}:
+                    p.copy_(p_data_fp32)
+        return loss
+class AdafactorSchedule(LambdaLR):
+    """
+    Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a scheduler (e.g.,
+    for logging), this class creates a proxy object that retrieves the current lr values from the optimizer.
+    It returns `initial_lr` during startup and the actual `lr` during stepping.
+    Arguments:
+        optimizer: The optimizer to read learning rates from. It must expose a `_get_lr(group, state)` method.
+        initial_lr (`float`, *optional*, defaults to 0.0): Value reported for a group until it has been stepped.
+    """
+    def __init__(self, optimizer, initial_lr=0.0):
+        def lr_lambda(_):
+            return initial_lr
+        # Temporarily expose `initial_lr` on every group so the parent constructor records it as the base lr,
+        # then remove the key again.
+        for group in optimizer.param_groups:
+            group["initial_lr"] = initial_lr
+        super().__init__(optimizer, lr_lambda)
+        for group in optimizer.param_groups:
+            del group["initial_lr"]
+    def get_lr(self):
+        """
+        Return one learning rate per parameter group.
+        A group reports the optimizer's current lr once its first parameter has both a gradient and initialized
+        optimizer state; otherwise (before stepping, empty group) it reports its base lr. The result always has the
+        same length as `optimizer.param_groups`.
+        """
+        opt = self.optimizer
+        lrs = []
+        for group, base_lr in zip(opt.param_groups, self.base_lrs):
+            params = group["params"]
+            # `.get` avoids creating an empty state entry; an empty/missing state means the parameter was never
+            # stepped, in which case `_get_lr` would fail on the missing "step"/"RMS" keys.
+            state = opt.state.get(params[0]) if params else None
+            if state and params[0].grad is not None:
+                lrs.append(opt._get_lr(group, state))
+            else:
+                lrs.append(base_lr)
+        return lrs
+def get_adafactor_schedule(optimizer, initial_lr=0.0):
+    """
+    Build the proxy learning-rate schedule used together with [`~optimization.Adafactor`].
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer whose learning rate the schedule reports.
+        initial_lr (`float`, *optional*, defaults to 0.0):
+            Learning rate reported before the optimizer has stepped.
+    Return:
+        An [`~optimization.AdafactorSchedule`] wrapping `optimizer`.
+    """
+    proxy_schedule = AdafactorSchedule(optimizer, initial_lr)
+    return proxy_schedule

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/tokenization_python.py ADDED Viewed

	@@ -0,0 +1,1420 @@

+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
+tokenization_utils_tokenizers.py
+"""
+import bisect
+import unicodedata
+from collections import OrderedDict
+from typing import Any, overload
+from .tokenization_utils_base import (
+    INIT_TOKENIZER_DOCSTRING,
+    AddedToken,
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    PreTrainedTokenizerBase,
+    TextInput,
+    TruncationStrategy,
+)
+from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+logger = logging.get_logger(__name__)
+# Slow tokenizers are saved in a vocabulary plus three separate files
+SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
+ADDED_TOKENS_FILE = "added_tokens.json"
+TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
+class Trie:
+    """
+    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
+    Loose reference https://en.wikipedia.org/wiki/Trie
+    """
+    def __init__(self, *args):
+        # Nested dicts keyed by single characters; a node that ends a word also holds the termination key.
+        self.data = {}
+        # Every word added so far.
+        self._tokens = set()
+        # Key marking "a word ends here" inside a node.
+        self._termination_char = ""
+        self.update(*args)
+    def update(self, *args):
+        """
+        Updates the Trie with new tokens.
+        Args:
+            *args: At most one positional argument: an iterable of words to add to the Trie. The arguments are
+                forwarded to `tuple(...)`, so passing no argument adds nothing, passing several positional
+                arguments raises a `TypeError`, and passing a single string adds each of its characters as a
+                separate word.
+        """
+        for token in tuple(*args):
+            self.add(token)
+    def add(self, word: str):
+        """
+        Passes over every char (utf-8 char) on word and iteratively adds it to the internal `data` trie representation.
+        The special key `""` in `self._termination_char` is used to represent termination.
+        This function is idempotent, adding twice the same word will leave the trie unchanged
+        Example:
+        ```python
+        >>> trie = Trie()
+        >>> trie.add("Hello 友達")
+        >>> trie.data
+        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
+        >>> trie.add("Hello")
+        >>> trie.data
+        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
+        ```
+        """
+        if not word:
+            # Prevent empty string
+            return
+        self._tokens.add(word)
+        ref = self.data
+        for char in word:
+            # Reuse the child node when it exists, otherwise create it, then descend into it.
+            ref[char] = ref.setdefault(char, {})
+            ref = ref[char]
+        ref[self._termination_char] = 1
+    def split(self, text: str) -> list[str]:
+        """
+        Will look for the words added to the trie within `text`. Output is the original string split along the
+        boundaries of the words found.
+        This trie will match the longest possible word first !
+        Example:
+        ```python
+        >>> trie = Trie()
+        >>> trie.split("[CLS] This is a extra_id_100")
+        ["[CLS] This is a extra_id_100"]
+        >>> trie.add("[CLS]")
+        >>> trie.add("extra_id_1")
+        >>> trie.add("extra_id_100")
+        >>> trie.split("[CLS] This is a extra_id_100")
+        ["[CLS]", " This is a ", "extra_id_100"]
+        ```
+        """
+        # indexes are counted left of the chars index.
+        # "hello", index 0, is left of h, index 1 is between h and e.
+        # index 5 is right of the "o".
+        # States are going to capture every possible start (indexes as above)
+        # as keys, and have as values, a pointer to the position in the trie
+        # where we're at. This is a partial match for now.
+        # This enables to keep track of multiple matches while we're iterating
+        # the string
+        # If the trie contains, "blowing", and "lower" and we encounter the
+        # string "blower", we need to split into ["b", "lower"].
+        # This is where we need to keep track of multiple possible starts.
+        states = OrderedDict()
+        # This will contain every index where we need
+        # to cut.
+        # We force to cut at offset 0 and len(text) (added later)
+        offsets = [0]
+        # This is used by the lookahead which needs to skip over
+        # some text where the full match exceeded the place in the initial
+        # for loop
+        skip = 0
+        # Main loop over every character of `text`
+        for current, current_char in enumerate(text):
+            if skip and current < skip:
+                # Prevents the lookahead for matching twice
+                # like extra_id_100 and id_100
+                continue
+            # This will track every state
+            # that stop matching, we need to stop tracking them.
+            # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then
+            # fail on "b", we need to remove 0 from the valid states.
+            to_remove = set()
+            # Whenever we found a match, we need to drop everything
+            # this is a greedy algorithm, it will match on the first found token
+            reset = False
+            # In this case, we already have partial matches (But unfinished)
+            for start, trie_pointer in states.items():
+                if "" in trie_pointer:
+                    # This is a final match, we need to reset and
+                    # store the results in `offsets`.
+                    # Lookahead to match longest first
+                    # Important in case of extra_id_1 vs extra_id_100
+                    # Here we are also actively looking for other earlier partial
+                    # matches
+                    # "[CLS]", "L", we need to match CLS even if L is special
+                    for lookstart, looktrie_pointer in states.items():
+                        if lookstart > start:
+                            # This partial match is later, we can stop looking
+                            break
+                        elif lookstart < start:
+                            # This partial match is earlier, the trie pointer
+                            # was already updated, so index is + 1
+                            lookahead_index = current + 1
+                            end = current + 1
+                        else:
+                            # Here lookstart == start and
+                            #      looktrie_pointer == trie_pointer
+                            # It wasn't updated yet so indices are current ones
+                            lookahead_index = current
+                            end = current
+                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
+                        if "" in looktrie_pointer:
+                            start = lookstart
+                            end = lookahead_index
+                            skip = lookahead_index
+                        # Keep walking the trie while the following characters continue a stored word,
+                        # remembering the longest complete word seen so far.
+                        while next_char in looktrie_pointer:
+                            looktrie_pointer = looktrie_pointer[next_char]
+                            lookahead_index += 1
+                            if "" in looktrie_pointer:
+                                start = lookstart
+                                end = lookahead_index
+                                skip = lookahead_index
+                            if lookahead_index == len(text):
+                                # End of string
+                                break
+                            next_char = text[lookahead_index]
+                        # End lookahead
+                    # Storing and resetting
+                    offsets.append(start)
+                    offsets.append(end)
+                    reset = True
+                    break
+                elif current_char in trie_pointer:
+                    # The current character being looked at has a match within the trie
+                    # update the pointer (it will be stored back into states later).
+                    trie_pointer = trie_pointer[current_char]
+                    # Storing back the new pointer into the states.
+                    # Partial matches got longer by one.
+                    states[start] = trie_pointer
+                else:
+                    # The new character has no match in the trie, we need
+                    # to stop keeping track of this partial match.
+                    # We can't do it directly within the loop because of how
+                    # python iteration works
+                    to_remove.add(start)
+            # Either clearing the full start (we found a real match)
+            # Or clearing only the partial matches that didn't work.
+            if reset:
+                states = {}
+            else:
+                for start in to_remove:
+                    del states[start]
+            # If this character is a starting character within the trie
+            # start keeping track of this partial match.
+            if current >= skip and current_char in self.data:
+                states[current] = self.data[current_char]
+        # We have a cut at the end with states.
+        for start, trie_pointer in states.items():
+            if "" in trie_pointer:
+                # This is a final match, we need to reset and
+                # store the results in `offsets`.
+                end = len(text)
+                offsets.append(start)
+                offsets.append(end)
+                # Longest cut is always the one with lower start so the first
+                # item so we need to break.
+                break
+        return self.cut_text(text, offsets)
+    def cut_text(self, text, offsets):
+        # Slice `text` at the positions listed in `offsets` and return the non-empty pieces in order.
+        # Note: `offsets` is modified in place (`len(text)` is appended to it).
+        # We have all the offsets now, we just need to do the actual splitting.
+        # We need to eventually add the first part of the string and the eventual
+        # last part.
+        offsets.append(len(text))
+        tokens = []
+        start = 0
+        for end in offsets:
+            if start > end:
+                # Offsets are expected to be non-decreasing; a decreasing offset is logged and ignored.
+                logger.error(
+                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
+                    " anyway."
+                )
+                continue
+            elif start == end:
+                # This might happen if there's a match at index 0
+                # we're also preventing zero-width cuts in case of two
+                # consecutive matches
+                continue
+            tokens.append(text[start:end])
+            start = end
+        return tokens
+class ExtensionsTrie(Trie):
+    """
+    Trie that can additionally list every stored token starting with a given prefix.
+    """
+    def __init__(self, *args):
+        super().__init__(*args)
+    def extensions(self, prefix: str):
+        """
+        Generates all extensions of a given prefix token in the Trie.
+        Args:
+            prefix (str): The prefix to look up.
+        Returns:
+            list: Every stored token that starts with `prefix` (including `prefix` itself when it is a stored
+            token). Empty when no stored token starts with `prefix`.
+        Example:
+        ```python
+        >>> trie = ExtensionsTrie()
+        >>> trie.add("apple")
+        >>> trie.add("app")
+        >>> trie.add("application")
+        >>> trie.extensions("app")
+        ['app', 'apple', 'application']
+        >>> trie.extensions("apx")
+        []
+        ```
+        """
+        prefix_node = self._get_node(prefix)
+        return [prefix + suffix for suffix in self._collect_tokens(prefix_node)]
+    def _get_node(self, token: str) -> dict:
+        """
+        Retrieves the node corresponding to the given token in the Trie.
+        Args:
+            token (str): The token for which the corresponding node needs to be retrieved.
+        Returns:
+            dict: The node in the Trie corresponding to the given token, or an empty dict when the Trie holds no
+            path spelling `token`.
+        """
+        node = self.data
+        for char in token:
+            if char not in node:
+                # Returning the deepest node reached so far would make `extensions` glue the suffixes of a
+                # shorter prefix onto `token` and report tokens that were never added.
+                return {}
+            node = node[char]
+        return node
+    def _collect_tokens(self, node: dict) -> list:
+        """
+        Generates all tokens in the Trie starting from a given node.
+        Args:
+            node (dict): The node in the Trie from which tokens need to be generated.
+        Returns:
+            list: List of tokens generated from the given node, expressed relative to that node.
+        """
+        # The termination key contributes the empty suffix: the path leading to `node` is itself a token.
+        tokens = [self._termination_char] if self._termination_char in node else []
+        for token, subtrie_head in node.items():
+            if token != self._termination_char:
+                subtokens = self._collect_tokens(subtrie_head)
+                tokens.extend([token + subtoken for subtoken in subtokens])
+        return tokens
+def _is_whitespace(char):
+    """Return True when `char` counts as whitespace."""
+    # Tab, newline and carriage return are formally control characters, but
+    # they are handled as whitespace here, as is conventional.
+    if char in (" ", "\t", "\n", "\r"):
+        return True
+    # Otherwise only the Unicode "space separator" category qualifies.
+    return unicodedata.category(char) == "Zs"
+def _is_control(char):
+    """Return True when `char` is a control character."""
+    # Tab, newline and carriage return belong to the control categories but
+    # are deliberately classified as whitespace instead.
+    if char in ("\t", "\n", "\r"):
+        return False
+    # Every Unicode category whose name starts with "C" counts as control.
+    return unicodedata.category(char).startswith("C")
+def _is_punctuation(char):
+    """Return True when `char` is a punctuation character."""
+    code_point = ord(char)
+    # Any ASCII character that is neither a letter nor a digit is treated as
+    # punctuation. This includes "^", "$" and "`", which Unicode does not
+    # place in a punctuation category, so that ASCII is handled consistently.
+    is_ascii_symbol = (
+        33 <= code_point <= 47 or 58 <= code_point <= 64 or 91 <= code_point <= 96 or 123 <= code_point <= 126
+    )
+    if is_ascii_symbol:
+        return True
+    # Outside that range, rely on the Unicode "P*" categories.
+    return unicodedata.category(char).startswith("P")
+def _is_end_of_word(text):
+    """Return True when the last character of `text` is a control, punctuation or whitespace character."""
+    tail = text[-1]
+    return _is_control(tail) or _is_punctuation(tail) or _is_whitespace(tail)
+def _is_start_of_word(text):
+    """Return True when the first character of `text` is a control, punctuation or whitespace character."""
+    head = text[0]
+    return _is_control(head) or _is_punctuation(head) or _is_whitespace(head)
+def _insert_one_token_to_ordered_list(token_list: list[str], new_token: str):
+    """
+    Insert `new_token` into `token_list` in place, keeping it sorted and free of duplicates.
+    `token_list` must already be sorted; nothing is inserted when `new_token` is already present.
+    """
+    position = bisect.bisect_left(token_list, new_token)
+    # `bisect_left` returns the slot of an existing equal element, if any.
+    already_present = position < len(token_list) and token_list[position] == new_token
+    if not already_present:
+        token_list.insert(position, new_token)
+@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
+class PythonBackend(PreTrainedTokenizerBase):
+    """
+    Base class for all slow tokenizers.
+    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
+    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
+    pretrained tokenizers as well as adding tokens to the vocabulary.
+    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
+    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
+    """
+    def __init__(self, **kwargs):
+        # Steps 1-6 run before the parent constructor; step 8 runs after it.
+        # 1. Create the trie used to split text on added tokens
+        self.tokens_trie = Trie()
+        # Initialize total_vocab_size early to avoid issues if get_vocab() is called early (custom tokenizers)
+        self.total_vocab_size = 0
+        # 2. init `_added_tokens_decoder` if child class did not
+        if not hasattr(self, "_added_tokens_decoder"):
+            self._added_tokens_decoder: dict[int, AddedToken] = {}
+        # 3. if a `added_tokens_decoder` is passed, we are loading from a saved tokenizer, we overwrite
+        self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
+        # Reverse mapping (token content -> index) derived from the decoder.
+        self._added_tokens_encoder: dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
+        # 4. Token type ID configuration for dynamic mask building
+        # These can be overridden by subclasses to avoid overriding create_token_type_ids_from_sequences
+        self.token_type_ids_pattern = kwargs.pop("token_type_ids_pattern", "bert_style")  # "all_zeros" or "bert_style"
+        self.token_type_ids_include_special_tokens = kwargs.pop("token_type_ids_include_special_tokens", True)
+        # 5. Special tokens mask configuration
+        # Patterns: "none", "cls_sep", "eos", "bos", "bos_eos", "cls_double_sep", "prefix_suffix"
+        self.special_tokens_pattern = kwargs.pop("special_tokens_pattern", None)
+        # 6. Set backend to "custom" if not already set (for direct PreTrainedTokenizer subclasses)
+        if "backend" not in kwargs:
+            kwargs["backend"] = "custom"
+        # 7. init the parent class with the remaining kwargs
+        super().__init__(**kwargs)
+        # 8. If some of the special tokens are not part of the vocab, we add them, at the end.
+        # V5: the order of addition follows self.SPECIAL_TOKENS_ATTRIBUTES, then extra special tokens
+        # Note: _add_tokens will automatically skip tokens that are already in the base vocab
+        self._add_tokens(
+            [token for token in self.all_special_tokens if token not in self._added_tokens_encoder],
+            special_tokens=True,
+        )
+    @property
+    def is_fast(self) -> bool:
+        # Always False for this Python ("slow") tokenizer backend.
+        return False
+    @property
+    def added_tokens_encoder(self) -> dict[str, int]:
+        """
+        Returns the mapping from added-token string to index, ordered by increasing index. It is rebuilt from
+        `self._added_tokens_decoder` on every access; the slow tokenizers keep a cached copy in
+        `self._added_tokens_encoder` for performance.
+        """
+        decoder = self._added_tokens_decoder
+        return {decoder[index].content: index for index in sorted(decoder)}
+    @property
+    def added_tokens_decoder(self) -> dict[int, AddedToken]:
+        """
+        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken, ordered by increasing index.
+        Returns:
+            `dict[int, AddedToken]`: The added tokens.
+        """
+        decoder = self._added_tokens_decoder
+        return {index: decoder[index] for index in sorted(decoder)}
+    @added_tokens_decoder.setter
+    def added_tokens_decoder(self, value: dict[int, AddedToken | str]) -> None:
+        """
+        Merge `value` (index -> token) into the added-tokens decoder and encoder, then refresh the vocabulary size.
+        Plain strings are wrapped in `AddedToken`.
+        Raises:
+            TypeError: If any index is not an `int` or any token is neither a `str` nor an `AddedToken`. Nothing is
+                modified in that case.
+        """
+        # Validate every entry before touching any state, so that an invalid entry cannot leave the decoder and
+        # encoder partially updated with a stale `total_vocab_size`.
+        for index, token in value.items():
+            if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
+                raise TypeError(
+                    f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, AddedToken | str}"
+                )
+        for index, token in value.items():
+            self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
+            self._added_tokens_encoder[str(token)] = index
+        self._update_total_vocab_size()
+    def get_added_vocab(self) -> dict[str, int]:
+        """
+        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
+        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
+        something we should change.
+        Note that the internal `self._added_tokens_encoder` dictionary itself is returned, not a copy.
+        Returns:
+            `dict[str, int]`: The added tokens.
+        """
+        return self._added_tokens_encoder
+    def __len__(self):
+        """
+        Size of the full vocabulary with the added tokens.
+        Returns the cached `self.total_vocab_size`, computing it first when it is still 0.
+        """
+        # Lazy evaluation: compute if not already set (e.g., during initialization)
+        if self.total_vocab_size == 0:
+            self._update_total_vocab_size()
+        return self.total_vocab_size
+    def _update_total_vocab_size(self):
+        """
+        Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
+        otherwise if there is a hole in the vocab, we will add tokens at a wrong index. This operation is slow and
+        is only updated when adding tokens.
+        """
+        # The size is the number of entries returned by `get_vocab()`.
+        self.total_vocab_size = len(self.get_vocab())
+    def _add_tokens(self, new_tokens: list[str] | list[AddedToken], special_tokens: bool = False) -> int:
+        """
+        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
+        it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
+        vocab which is why they have to be handled specifically.
+        Args:
+            new_tokens (`list[str]` or `list[tokenizers.AddedToken]`):
+                Token(s) to add in vocabulary. A token is counted as added if its content is not already a key of
+                `get_vocab()`. If a token is part of the vocabulary then it is only registered as an `AddedToken`
+                under its existing index, which allows to control the stripping and normalization of this token.
+                Empty tokens are skipped, and so are plain strings that are already registered as added tokens.
+                `AddedToken` objects passed in may be modified in place (special flag, lowercased content).
+            special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the tokens should be added as special tokens.
+        Returns:
+            `int`: The number of tokens actually added to the vocabulary.
+        Raises:
+            `TypeError`: If an element of `new_tokens` is neither a `str` nor an `AddedToken`.
+        Examples:
+        ```python
+        # Let's see how to increase the vocabulary of Bert model and tokenizer
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        model = BertModel.from_pretrained("google-bert/bert-base-uncased")
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
+        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
+        ```"""
+        added_tokens = 0
+        if new_tokens is None:
+            return added_tokens
+        # TODO this is fairly slow to improve!
+        # Work on a copy so that indices handed out in this call are tracked without touching the source vocab.
+        current_vocab = self.get_vocab().copy()
+        new_idx = len(current_vocab)  # only call this once, len gives the last index + 1
+        for token in new_tokens:
+            if not isinstance(token, (str, AddedToken)):
+                raise TypeError(f"Token {token} is not a string but a {type(token)}.")
+            # Empty tokens are silently ignored.
+            if str(token) == "":
+                continue
+            if isinstance(token, str):
+                # A plain string that is already a registered added token is left untouched.
+                if token in self._added_tokens_encoder:
+                    continue
+                else:
+                    # very important for fast and slow equivalence!
+                    # Wrap the string: special tokens are not normalized, regular ones are.
+                    is_special = token in self.all_special_tokens or special_tokens
+                    token = AddedToken(
+                        token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
+                    )
+            elif special_tokens:
+                # doing token.special=True changes the normalization! will fix in rust
+                # this is important and the only reason why the AddedTokens in each class are normalized by default
+                token.__setstate__({"special": True, "normalized": token.normalized})
+            # NOTE(review): `_added_tokens_decoder` is keyed by index (see the assignment at the end of this loop),
+            # so this membership test compares a token against integer keys - confirm this is the intended check.
+            if token in self._added_tokens_decoder:
+                continue
+            if not token.special and token.normalized and getattr(self, "do_lower_case", False):
+                # Normalize if requested
+                # This rewrites the token object itself, including one supplied by the caller.
+                token.content = token.content.lower()
+            if token.content not in current_vocab:
+                # Unknown content: hand out the next free index after the current vocabulary.
+                token_index = new_idx + added_tokens
+                current_vocab[token.content] = token_index
+                added_tokens += 1
+            else:
+                # Known content: keep its index and only register it as an added token (not counted as added).
+                token_index = current_vocab[token.content]
+            if token.special and str(token) not in self.all_special_tokens:
+                self._extra_special_tokens.append(token)
+            # Both directions of the added-token mapping are updated explicitly here.
+            self._added_tokens_decoder[token_index] = token
+            self._added_tokens_encoder[token.content] = token_index
+            if self.verbose:
+                logger.info(f"Adding {token} to the vocabulary")
+        # Refresh the structures derived from the added tokens once, after the whole batch.
+        self._update_trie()
+        self._update_total_vocab_size()
+        return added_tokens
+    def _update_trie(self, unique_no_split_tokens: list[str] | None = None):
+        for token in self._added_tokens_decoder.values():
+            if token.content not in self.tokens_trie._tokens:
+                self.tokens_trie.add(token.content)
+        for token in unique_no_split_tokens or []:
+            if token not in self.tokens_trie._tokens:
+                self.tokens_trie.add(token)
+    def num_special_tokens_to_add(self, pair: bool = False) -> int:
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
+        <Tip>
+        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
+        this inside your training loop.
+        </Tip>
+        Args:
+            pair (`bool`, *optional*, defaults to `False`):
+                Whether the number of added tokens should be computed in the case of a sequence pair or a single
+                sequence.
+        Returns:
+            `int`: Number of special tokens added to sequences.
+        """
+        token_ids_0 = []
+        token_ids_1 = []
+        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
+    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
+        """
+        Converts a string into a sequence of tokens, using the tokenizer.
+        Unless `split_special_tokens` is set, the text is first split around the added tokens so that they are kept
+        whole, and only the remaining pieces go through `_tokenize`.
+        Args:
+            text: The sequence to be encoded.
+            **kwargs: Passed along to the model-specific `prepare_for_tokenization` preprocessing method. The
+                `split_special_tokens` key is consumed here and defaults to `self.split_special_tokens`.
+        Returns:
+            The list of tokens.
+        """
+        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+        if split_special_tokens:
+            # Don't split on any tokens - just tokenize directly
+            # NOTE(review): the kwargs returned by `prepare_for_tokenization` are not forwarded to `_tokenize`.
+            return self._tokenize(text)
+        # Split on added tokens
+        tokens = self.tokens_trie.split(text)
+        no_split_token = self._added_tokens_encoder.keys()
+        # Handle added token properties (lstrip, rstrip, single_word)
+        # Neighbouring chunks are rewritten in place, so later iterations see the adjusted text.
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token])
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # rstrip on the token removes the whitespace that follows it
+                        tokens[i + 1] = right.lstrip()
+                    if tok_extended.lstrip and left:
+                        # lstrip on the token removes the whitespace that precedes it
+                        tokens[i - 1] = left.rstrip()
+                    if tok_extended.single_word:
+                        # The token is glued to a neighbour that does not border it with a space: merge it back
+                        # into that neighbour and blank this slot (empty entries are dropped below).
+                        # `left` / `right` still hold the text from before the stripping above.
+                        if left and left[-1] != " ":
+                            tokens[i - 1] += token
+                            tokens[i] = ""
+                        elif right and right[0] != " ":
+                            tokens[i + 1] = token + tokens[i + 1]
+                            tokens[i] = ""
+        # Tokenize non-added tokens
+        result = []
+        # Built once so the membership test inside the loop does not re-read the property.
+        all_special_tokens_set = set(self.all_special_tokens)
+        for token in tokens:
+            if not token:
+                continue
+            if token in no_split_token or token in all_special_tokens_set:
+                # Added and special tokens are emitted as-is.
+                result.append(token)
+            else:
+                result.extend(self._tokenize(token))
+        return result
+    def _tokenize(self, text, **kwargs):
+        """
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _convert_token_to_id_with_added_voc(self, token):
+        if token in self.added_tokens_encoder:
+            return self.added_tokens_encoder[token]
+        return self._convert_token_to_id(token)
+    def _convert_token_to_id(self, token):
+        raise NotImplementedError
+    def _encode_plus(
+        self,
+        text: TextInput | PreTokenizedInput | EncodedInput,
+        text_pair: TextInput | PreTokenizedInput | EncodedInput | None = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: int | None = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_tensors: str | TensorType | None = None,
+        return_token_type_ids: bool | None = None,
+        return_attention_mask: bool | None = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Encode a single input, a pair, or a batch of them into a `BatchEncoding`.
+        Batched input is handled by calling this method once per element without padding or tensor conversion,
+        collecting the per-key results into lists, and then padding and converting the whole batch at once.
+        A single input is converted to ids (by tokenizing strings, or by taking ids as given) and handed to
+        `prepare_for_model`.
+        Args:
+            text: A string, a list of strings (tokens, or words when `is_split_into_words=True`), a list of ids,
+                or a list/tuple of those for a batch.
+            text_pair: Optional second sequence; for a batch it must be a list/tuple of the same length as `text`.
+            is_split_into_words: Whether lists of strings are words that still have to be tokenized one by one.
+            **kwargs: Forwarded to `tokenize` (and to the per-element call for a batch).
+            The remaining arguments are passed on to `prepare_for_model` and `pad`.
+        Returns:
+            `BatchEncoding`: The encoded inputs.
+        Raises:
+            `ValueError`: If `text_pair` does not match a batched `text`, if a batch element holds an unexpected
+                number of sequences, or if an input is not a string, a list of strings, or a list of ints.
+        """
+        # Detect batched inputs (list of sequences)
+        is_batched = isinstance(text, (list, tuple)) and (
+            (not text and not is_split_into_words)
+            or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
+            or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
+        )
+        if is_batched:
+            if text_pair is not None:
+                if not isinstance(text_pair, (list, tuple)) or len(text_pair) != len(text):
+                    raise ValueError("If `text` is a batch, `text_pair` must also be a batch of the same length.")
+            pairs = text_pair if text_pair is not None else [None] * len(text)
+            # Maps each output key to the list of per-element values.
+            batch_outputs = {}
+            for current_text, current_pair in zip(text, pairs):
+                # Handle tuples/lists as sequence pairs like ("text1", "text2")
+                # For is_split_into_words=True: only unpack if it's a tuple of exactly 2 sequences (pair)
+                # Otherwise, treat the list as a single pretokenized sequence
+                if (
+                    isinstance(current_text, (list, tuple))
+                    and current_text
+                    and not isinstance(current_text[0], int)
+                    and current_pair is None
+                ):
+                    # Check if this looks like a pair: tuple/list of length 2 where elements are strings or lists/tuples
+                    is_pair = (
+                        len(current_text) == 2
+                        and (isinstance(current_text[0], str) or isinstance(current_text[0], (list, tuple)))
+                        and (isinstance(current_text[1], str) or isinstance(current_text[1], (list, tuple)))
+                    )
+                    if is_pair:
+                        current_text, current_pair = current_text
+                    elif len(current_text) == 1:
+                        # A one-element container is unwrapped to the sequence it holds.
+                        current_text = current_text[0]
+                    elif not is_split_into_words:
+                        # Only raise error for non-pretokenized input
+                        raise ValueError(f"Expected a pair of sequences, got {len(current_text)} sequences.")
+                # Encode one element; padding, attention mask and tensor conversion are deferred to the batch level.
+                current_output = self._encode_plus(
+                    text=current_text,
+                    text_pair=current_pair,
+                    add_special_tokens=add_special_tokens,
+                    padding_strategy=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
+                    truncation_strategy=truncation_strategy,
+                    max_length=max_length,
+                    stride=stride,
+                    is_split_into_words=is_split_into_words,
+                    pad_to_multiple_of=None,  # we pad in batch afterward
+                    padding_side=None,  # we pad in batch afterward
+                    return_tensors=None,  # We convert the whole batch to tensors at the end
+                    return_token_type_ids=return_token_type_ids,
+                    return_attention_mask=False,  # we pad in batch afterward
+                    return_overflowing_tokens=return_overflowing_tokens,
+                    return_special_tokens_mask=return_special_tokens_mask,
+                    return_length=return_length,
+                    verbose=verbose,
+                    **kwargs,
+                )
+                for key, value in current_output.items():
+                    batch_outputs.setdefault(key, []).append(value)
+            # Remove overflow-related keys before tensor conversion if return_tensors is set
+            # Slow tokenizers don't support returning these as tensors
+            if return_tensors and return_overflowing_tokens:
+                batch_outputs.pop("overflowing_tokens", None)
+                batch_outputs.pop("num_truncated_tokens", None)
+            batch_outputs = self.pad(
+                batch_outputs,
+                padding=padding_strategy.value,
+                max_length=max_length,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+            return BatchEncoding(batch_outputs, tensor_type=return_tensors)
+        # Single sequence handling
+        def get_input_ids(text):
+            # Convert one input (string, list of strings, or list of ids) to a list of ids.
+            if isinstance(text, str):
+                # Normal case: tokenize string
+                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+            if isinstance(text, (list, tuple)) and text:
+                # Already ids: returned unchanged.
+                if isinstance(text[0], int):
+                    return text
+                # Pre-tokenized strings
+                if isinstance(text[0], str):
+                    if is_split_into_words:
+                        # Words: tokenize each word and flatten the pieces.
+                        return self.convert_tokens_to_ids(
+                            [tok for word in text for tok in self.tokenize(word, **kwargs)]
+                        )
+                    # Tokens: only the token-to-id lookup is needed.
+                    return self.convert_tokens_to_ids(text)
+            # Reached for anything else, including an empty list or tuple.
+            raise ValueError(f"Input must be a string, list of strings, or list of ints, got: {type(text)}")
+        first_ids = get_input_ids(text)
+        second_ids = get_input_ids(text_pair) if text_pair is not None else None
+        return self.prepare_for_model(
+            first_ids,
+            pair_ids=second_ids,
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+    def prepare_for_tokenization(
+        self, text: str, is_split_into_words: bool = False, **kwargs
+    ) -> tuple[str, dict[str, Any]]:
+        """
+        Performs any necessary transformations before tokenization.
+        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
+        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
+        Args:
+            text (`str`):
+                The text to prepare.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize. This is useful for NER or token classification.
+            kwargs (`dict[str, Any]`, *optional*):
+                Keyword arguments to use for the tokenization.
+        Returns:
+            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
+        """
+        return (text, kwargs)
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences by adding special tokens.
+        This method dynamically builds inputs based on the tokenizer's `special_tokens_pattern`:
+        - `"none"`: No special tokens
+        - `"cls_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+        - `"eos"`: seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+        - `"bos"`: [BOS] seq0 or [BOS] seq0 [BOS] seq1
+        - `"bos_eos"`: [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+        - `"cls_double_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+        - `"prefix_suffix"`: `<prefix_tokens> seq0 [seq1] <suffix_tokens>` (custom prefix/suffix stored on the tokenizer)
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: List of input IDs with the appropriate special tokens.
+        """
+        if self.special_tokens_pattern == "cls_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+            if self.cls_token_id is None and self.sep_token_id is None:
+                raise ValueError(
+                    "Cannot add special tokens following 'cls_sep' pattern because one or several special tokens "
+                    f"are not defined (cls_token_id={self.cls_token_id}; sep_token_id={self.sep_token_id})"
+                    "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`"
+                )
+            if token_ids_1 is None:
+                return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
+        elif self.special_tokens_pattern == "eos":
+            # seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+            if self.eos_token_id is None:
+                raise ValueError(
+                    "Cannot add special tokens following 'eos' pattern because eos token is not defined "
+                    f"(eos_token_id={self.eos_token_id})."
+                    "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`"
+                )
+            if token_ids_1 is None:
+                return token_ids_0 + [self.eos_token_id]
+            return token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+        elif self.special_tokens_pattern == "bos":
+            # [BOS] seq0 or [BOS] seq0 [BOS] seq1
+            if self.bos_token_id is None:
+                raise ValueError(
+                    "Cannot add special tokens following 'bos' pattern because bos token is not defined "
+                    f"(bos_token_id={self.bos_token_id})."
+                    "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`"
+                )
+            if token_ids_1 is None:
+                return [self.bos_token_id] + token_ids_0
+            return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
+        elif self.special_tokens_pattern == "bos_eos":
+            # [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+            if self.bos_token_id is None and self.eos_token_id is None:
+                raise ValueError(
+                    "Cannot add special tokens following 'bos_eos' pattern because one or several special tokens "
+                    f"are not defined (bos_token_id={self.bos_token_id}; eos_token_id={self.eos_token_id})"
+                    "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`"
+                )
+                return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1
+            if token_ids_1 is None:
+                return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+        elif self.special_tokens_pattern == "cls_double_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+            if self.cls_token_id is None and self.sep_token_id is None:
+                raise ValueError(
+                    "Cannot add special tokens following 'cls_double_sep' pattern because one or several special tokens "
+                    f"are not defined (cls_token_id={self.cls_token_id}; sep_token_id={self.sep_token_id})"
+                    "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`"
+                )
+            if token_ids_1 is None:
+                return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+            return (
+                [self.cls_token_id]
+                + token_ids_0
+                + [self.sep_token_id, self.sep_token_id]
+                + token_ids_1
+                + [self.sep_token_id]
+            )
+        elif self.special_tokens_pattern == "prefix_suffix":
+            prefix_tokens = getattr(self, "prefix_tokens", [])
+            suffix_tokens = getattr(self, "suffix_tokens", [])
+            if token_ids_1 is None:
+                return prefix_tokens + token_ids_0 + suffix_tokens
+            return prefix_tokens + token_ids_0 + token_ids_1 + suffix_tokens
+        else:  # "none" or any other value
+            # No special tokens
+            if token_ids_1 is None:
+                return token_ids_0
+            return token_ids_0 + token_ids_1
+    def get_special_tokens_mask(
+        self, token_ids_0: list, token_ids_1: list | None = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+        This method dynamically builds the special tokens mask based on the tokenizer's `special_tokens_pattern`:
+        - `"none"`: No special tokens (default, returns all 0s)
+        - `"cls_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+        - `"eos"`: seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+        - `"bos"`: [BOS] seq0 or [BOS] seq0 [BOS] seq1
+        - `"bos_eos"`: [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+        - `"cls_double_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+        - `"prefix_suffix"`: `<prefix_tokens> seq0 [seq1] <suffix_tokens>`
+        Args:
+            token_ids_0 (`list[int]`):
+                List of ids of the first sequence.
+            token_ids_1 (`list[int]`, *optional*):
+                List of ids of the second sequence.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        if self.special_tokens_pattern == "cls_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0)) + [1]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        elif self.special_tokens_pattern == "eos":
+            # seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+            if token_ids_1 is None:
+                return ([0] * len(token_ids_0)) + [1]
+            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        elif self.special_tokens_pattern == "bos":
+            # [BOS] seq0 or [BOS] seq0 [BOS] seq1
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0))
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+        elif self.special_tokens_pattern == "bos_eos":
+            # [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0)) + [1]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        elif self.special_tokens_pattern == "cls_double_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0)) + [1]
+            return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+        elif self.special_tokens_pattern == "prefix_suffix":
+            prefix_len = len(getattr(self, "prefix_tokens", []))
+            suffix_len = len(getattr(self, "suffix_tokens", []))
+            mask = [1] * prefix_len + ([0] * len(token_ids_0))
+            if token_ids_1 is not None:
+                mask += [0] * len(token_ids_1)
+            mask += [1] * suffix_len
+            return mask
+        else:
+            return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
+    @overload
+    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
+    @overload
+    def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ...
+    def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool = False) -> str | list[str]:
+        """
+        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
+        added tokens.
+        Args:
+            ids (`int` or `list[int]`):
+                The token id (or token ids) to convert to tokens.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the decoding.
+        Returns:
+            `str` or `list[str]`: The decoded token(s).
+        """
+        if isinstance(ids, int):
+            return (
+                self._added_tokens_decoder[ids].content
+                if ids in self._added_tokens_decoder
+                else self._convert_id_to_token(ids)
+            )
+        tokens = []
+        # self.all_special_ids is an @property which may be slow, so only compute it once before the loop
+        ids_to_skip = set(self.all_special_ids) if skip_special_tokens else set()
+        for index in ids:
+            index = int(index)
+            if index in ids_to_skip:
+                continue
+            tokens.append(
+                self._added_tokens_decoder[index].content
+                if index in self._added_tokens_decoder
+                else self._convert_id_to_token(index)
+            )
+        return tokens
+    def _convert_id_to_token(self, index: int) -> str:
+        raise NotImplementedError
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        return " ".join(tokens)
+    def _decode(
+        self,
+        token_ids: int | list[int],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool | None = None,
+        **kwargs,
+    ) -> str:
+        """Decode token ids to string."""
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        if isinstance(filtered_tokens, str):
+            filtered_tokens = [filtered_tokens]
+        text = self.convert_tokens_to_string(filtered_tokens)
+        # Apply tokenizer-specific cleanup if available and requested
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            text = self.clean_up_tokenization(text)
+        return text
+    def prepare_for_model(
+        self,
+        ids: list[int],
+        pair_ids: list[int] | None = None,
+        add_special_tokens: bool = True,
+        padding: bool | str | PaddingStrategy = False,
+        truncation: bool | str | TruncationStrategy = False,
+        max_length: int | None = None,
+        stride: int = 0,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_tensors: str | TensorType | None = None,
+        return_token_type_ids: bool | None = None,
+        return_attention_mask: bool | None = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input ids so it can be used by the model. Adds special tokens, truncates, and pads.
+        Args:
+            ids: Tokenized input ids of the first sequence.
+            pair_ids: Tokenized input ids of the second sequence (optional).
+            add_special_tokens: Whether to wrap the sequence(s) with `build_inputs_with_special_tokens`.
+            padding, truncation, max_length, pad_to_multiple_of, verbose, **kwargs: Resolved into the effective
+                padding and truncation strategies by `_get_padding_truncation_strategies`.
+            stride: Forwarded to `truncate_sequences`.
+            padding_side: Forwarded to `pad`.
+            return_tensors: Tensor type handed to the returned `BatchEncoding`.
+            return_token_type_ids: Whether to include `token_type_ids`; `None` means "if listed in
+                `model_input_names`".
+            return_attention_mask: Whether to include the attention mask; `None` means "if listed in
+                `model_input_names`".
+            return_overflowing_tokens: Whether to include `overflowing_tokens` and `num_truncated_tokens`. They are
+                only added when `return_tensors` is not set and something was actually truncated.
+            return_special_tokens_mask: Whether to include `special_tokens_mask`.
+            return_length: Whether to include `length`, the length of the final `input_ids`.
+            prepend_batch_axis: Forwarded to the returned `BatchEncoding`.
+        Returns:
+            `BatchEncoding`: The prepared inputs.
+        Raises:
+            `ValueError`: If overflowing tokens are requested for a pair with the `longest_first` strategy.
+        """
+        # Get padding/truncation strategies
+        padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+        # Validation
+        if (
+            return_overflowing_tokens
+            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
+            and pair_ids is not None
+        ):
+            raise ValueError(
+                "Not possible to return overflowing tokens for pair of sequences with the "
+                "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                "for instance `only_second` or `only_first`."
+            )
+        # Defaults
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+        # Truncation
+        pair = pair_ids is not None
+        num_special = self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0
+        # Length the final sequence would have without truncation, special tokens included.
+        total_len = len(ids) + len(pair_ids or []) + num_special
+        overflowing_tokens = []
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
+            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
+                ids,
+                pair_ids=pair_ids,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
+        # Add special tokens
+        if add_special_tokens:
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+        else:
+            # Plain concatenation, with every position given token type 0.
+            sequence = ids + (pair_ids if pair_ids else [])
+            token_type_ids = [0] * len(sequence)
+        # Build output
+        encoded_inputs = {"input_ids": sequence}
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            encoded_inputs["special_tokens_mask"] = (
+                self.get_special_tokens_mask(ids, pair_ids) if add_special_tokens else [0] * len(sequence)
+            )
+        # Overflow details are skipped when tensors are requested or when nothing overflowed.
+        if return_overflowing_tokens and not return_tensors and overflowing_tokens:
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
+        # Check sequence length and warn if needed
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+        # Pad
+        # `pad` is also called when only the attention mask is wanted, even without a padding strategy.
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+        if return_length:
+            # Measured after padding.
+            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
+        return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
+    def truncate_sequences(
+        self,
+        ids: list[int],
+        pair_ids: list[int] | None = None,
+        num_tokens_to_remove: int = 0,
+        truncation_strategy: str | TruncationStrategy = "longest_first",
+        stride: int = 0,
+    ) -> tuple[list[int], list[int] | None, list[int]]:
+        """
+        Truncate `ids` (and optionally `pair_ids`) by `num_tokens_to_remove` tokens.
+        The side tokens are dropped from is selected by `self.truncation_side`.
+        Args:
+            ids (`list[int]`):
+                Token ids of the first sequence.
+            pair_ids (`list[int]`, *optional*):
+                Token ids of the second sequence, if any.
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
+                Number of tokens to drop. Nothing is changed when it is zero or negative.
+            truncation_strategy (`str` or `TruncationStrategy`, *optional*, defaults to `"longest_first"`):
+                Strategy to apply; a string is converted to `TruncationStrategy`.
+            stride (`int`, *optional*, defaults to 0):
+                Added to `num_tokens_to_remove` to size the overflow window (capped at the sequence length).
+        Returns:
+            `tuple`: `(ids, pair_ids, overflowing_tokens)`. `pair_ids` is returned as `None` only when it was
+            passed as `None`. `overflowing_tokens` is always empty for `longest_first` applied to a pair.
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, pair_ids, []
+        if not isinstance(truncation_strategy, TruncationStrategy):
+            truncation_strategy = TruncationStrategy(truncation_strategy)
+        overflowing_tokens = []
+        # ONLY_FIRST or LONGEST_FIRST with single sequence
+        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
+            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
+        ):
+            window_len = min(len(ids), stride + num_tokens_to_remove)
+            if self.truncation_side == "left":
+                overflowing_tokens = ids[:window_len]
+                ids = ids[num_tokens_to_remove:]
+            else:
+                overflowing_tokens = ids[-window_len:]
+                ids = ids[:-num_tokens_to_remove]
+        # LONGEST_FIRST with pair
+        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+            logger.warning(
+                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
+                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
+                "truncation strategy. So the returned list will always be empty even if some "
+                "tokens have been removed."
+            )
+            len_ids, len_pair = len(ids), len(pair_ids) if pair_ids else 0
+            # First shrink the longer sequence down to the length of the shorter one, then split
+            # whatever still has to be removed between the two sequences.
+            first_remove = min(abs(len_pair - len_ids), num_tokens_to_remove)
+            second_remove = num_tokens_to_remove - first_remove
+            if len_ids > len_pair:
+                ids_to_move = first_remove + second_remove // 2
+                pair_ids_to_move = second_remove - second_remove // 2
+            else:
+                ids_to_move = second_remove // 2
+                pair_ids_to_move = first_remove + second_remove - (second_remove // 2)
+            if self.truncation_side == "right":
+                ids = ids[:-ids_to_move] if ids_to_move > 0 else ids
+                pair_ids = pair_ids[:-pair_ids_to_move] if pair_ids and pair_ids_to_move > 0 else pair_ids
+            else:
+                ids = ids[ids_to_move:]
+                # An empty `pair_ids` list is kept as-is (not replaced by `None`), matching the
+                # right-side branch above.
+                if pair_ids:
+                    pair_ids = pair_ids[pair_ids_to_move:]
+        # ONLY_SECOND
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids:
+            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+            if self.truncation_side == "right":
+                overflowing_tokens = pair_ids[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+            else:
+                overflowing_tokens = pair_ids[:window_len]
+                pair_ids = pair_ids[num_tokens_to_remove:]
+        return ids, pair_ids, overflowing_tokens
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
+    ) -> list[int]:
+        """
+        Build the token type IDs for one sequence or a pair of sequences.
+        The result is driven by three tokenizer attributes:
+        - `token_type_ids_include_special_tokens`: when truthy, segment lengths are measured on the output of
+          `build_inputs_with_special_tokens` instead of on the raw id lists.
+        - `special_tokens_pattern`: when equal to `"prefix_suffix"`, the result is all zeros, sized as
+          prefix tokens + both sequences + suffix tokens.
+        - `token_type_ids_pattern`: when equal to `"bert_style"` and a pair is given, the first segment gets 0s
+          and the second gets 1s; otherwise everything gets 0s.
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: Token type IDs according to the configured pattern.
+        Examples:
+            ```python
+            # All zeros pattern
+            tokenizer.token_type_ids_pattern = "all_zeros"
+            # Returns: [0, 0, 0, ...] for both sequences
+            # BERT-style pattern (first sequence gets 0s, second gets 1s)
+            tokenizer.token_type_ids_pattern = "bert_style"
+            # Returns: [0, 0, 0, ..., 1, 1, 1, ...] for sequence pairs
+            ```
+        """
+        has_pair = token_ids_1 is not None
+        if not self.token_type_ids_include_special_tokens:
+            # Raw lengths, special tokens ignored
+            first_len = len(token_ids_0)
+            second_len = len(token_ids_1) if has_pair else 0
+        elif has_pair:
+            # Approximate split: the second segment is whatever the pair adds on top of the first
+            # sequence built on its own. Subclasses with more complex layouts should override.
+            pair_len = len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))
+            first_len = len(self.build_inputs_with_special_tokens(token_ids_0))
+            second_len = pair_len - first_len
+        else:
+            first_len = len(self.build_inputs_with_special_tokens(token_ids_0))
+            second_len = 0
+        if self.special_tokens_pattern == "prefix_suffix":
+            affix_len = len(getattr(self, "prefix_tokens", [])) + len(getattr(self, "suffix_tokens", []))
+            content_len = len(token_ids_0) + (len(token_ids_1) if has_pair else 0)
+            return [0] * (affix_len + content_len)
+        if has_pair and self.token_type_ids_pattern == "bert_style":
+            return [0] * first_len + [1] * second_len
+        return [0] * (first_len + second_len)
+    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
+        """
+        Write the tokenizer vocabulary (and BPE merges, when present) into `save_directory`.
+        The vocabulary is taken from `self.encoder`, falling back to `self.vocab`, and dumped as sorted,
+        indented JSON. If `self.bpe_ranks` exists, the merges are written one per line in rank order.
+        File names come from `self.vocab_files_names` when available (`vocab.json` / `merges.txt` otherwise).
+        Override this method if your tokenizer needs custom saving logic (e.g., SentencePiece models,
+        multiple vocabulary files, or special file formats).
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary. Must already exist.
+            filename_prefix (`str`, *optional*):
+                An optional prefix, joined with `-`, to add to the names of the saved files.
+        Returns:
+            `tuple[str, ...]`: Paths to the files saved, or an empty tuple if there is no vocabulary or
+            `save_directory` is not a directory.
+        """
+        import json
+        import os
+        vocabulary = getattr(self, "encoder", None) or getattr(self, "vocab", None)
+        if vocabulary is None:
+            return ()
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return ()
+        file_names = getattr(self, "vocab_files_names", {})
+        name_prefix = "" if not filename_prefix else f"{filename_prefix}-"
+        # Vocabulary file
+        vocab_file = os.path.join(save_directory, name_prefix + file_names.get("vocab_file", "vocab.json"))
+        serialized = json.dumps(vocabulary, indent=2, sort_keys=True, ensure_ascii=False)
+        with open(vocab_file, "w", encoding="utf-8") as vocab_handle:
+            vocab_handle.write(serialized + "\n")
+        # Merges file, only for tokenizers exposing `bpe_ranks`
+        merges = getattr(self, "bpe_ranks", None)
+        if merges is None:
+            return (vocab_file,)
+        merge_file = os.path.join(save_directory, name_prefix + file_names.get("merges_file", "merges.txt"))
+        ranked_merges = sorted(merges.items(), key=lambda kv: kv[1])
+        with open(merge_file, "w", encoding="utf-8") as merges_handle:
+            if getattr(self, "add_bpe_version_header", False):
+                merges_handle.write("#version: 0.2\n")
+            expected_rank = 0
+            for pieces, rank in ranked_merges:
+                if rank != expected_rank:
+                    # A gap in the ranks: warn, then resume counting from the rank actually found
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    expected_rank = rank
+                merges_handle.write(" ".join(pieces) + "\n")
+                expected_rank += 1
+        return (vocab_file, merge_file)
+# Backward compatibility alias: `PreTrainedTokenizer` refers to the very same class object as
+# `PythonBackend`, so both names can be used interchangeably.
+PreTrainedTokenizer = PythonBackend

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/video_utils.py ADDED Viewed

	@@ -0,0 +1,893 @@

+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import warnings
+from collections.abc import Callable, Iterable, Mapping
+from contextlib import redirect_stdout
+from dataclasses import dataclass, fields
+from io import BytesIO
+from typing import NewType, Union
+from urllib.parse import urlparse
+import httpx
+import numpy as np
+from .image_transforms import PaddingMode, to_channel_dimension_format
+from .image_utils import ChannelDimension, infer_channel_dimension_format, is_valid_image
+from .utils import (
+    is_av_available,
+    is_cv2_available,
+    is_decord_available,
+    is_numpy_array,
+    is_torch_available,
+    is_torch_tensor,
+    is_torchcodec_available,
+    is_torchvision_available,
+    is_vision_available,
+    is_yt_dlp_available,
+    logging,
+    requires_backends,
+)
+if is_vision_available():
+    import PIL.Image
+    if is_torchvision_available():
+        from torchvision import io as torchvision_io
+if is_torch_available():
+    import torch
+logger = logging.get_logger(__name__)
+# `URL` and `Path` are `NewType` aliases over `str`; both appear in `VideoInput` below.
+# NOTE(review): this `Path` is a plain string type and is unrelated to `pathlib.Path`.
+URL = NewType("URL", str)
+Path = NewType("Path", str)
+# Every accepted video input shape: a single item, a list, or a list of lists of
+# PIL images, numpy arrays, torch tensors, URLs or paths.
+VideoInput = Union[
+    list["PIL.Image.Image"],
+    np.ndarray,
+    "torch.Tensor",
+    list[np.ndarray],
+    list["torch.Tensor"],
+    list[list["PIL.Image.Image"]],
+    list[list[np.ndarray]],
+    list[list["torch.Tensor"]],
+    URL,
+    list[URL],
+    list[list[URL]],
+    Path,
+    list[Path],
+    list[list[Path]],
+]
+@dataclass
+class VideoMetadata(Mapping):
+    """
+    Metadata describing a video, usable both as a dataclass and as a read/write mapping.
+    The mapping interface iterates over the dataclass field names, so `dict(metadata)` and
+    `VideoMetadata(**metadata)` both work.
+    """
+    # Number of frames in the source video
+    total_num_frames: int
+    # Frames per second of the source video
+    fps: float | None = None
+    width: int | None = None
+    height: int | None = None
+    duration: float | None = None
+    # Name of the decoder that produced the metadata
+    video_backend: str | None = None
+    # Indices of the frames that were sampled from the source video
+    frames_indices: list[int] | None = None
+    def __iter__(self):
+        return (f.name for f in fields(self))
+    def __len__(self):
+        return len(fields(self))
+    def __getitem__(self, item):
+        return getattr(self, item)
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+    @property
+    def timestamps(self) -> list[float]:
+        "Timestamps of the sampled frames in seconds."
+        if self.fps is None or self.frames_indices is None:
+            raise ValueError("Cannot infer video `timestamps` when `fps` or `frames_indices` is None.")
+        return [frame_idx / self.fps for frame_idx in self.frames_indices]
+    @property
+    def sampled_fps(self) -> float:
+        "FPS of the sampled video. Falls back to `fps` (or 24 when `fps` is unset) if it cannot be computed."
+        # `not self.total_num_frames` also covers a frame count of 0, which would otherwise
+        # divide by zero below.
+        if self.frames_indices is None or not self.total_num_frames or self.fps is None:
+            return self.fps or 24
+        return len(self.frames_indices) / self.total_num_frames * self.fps
+    def update(self, dictionary):
+        "Copy the values of `dictionary` onto matching attributes; keys that are not attributes are ignored."
+        for key, value in dictionary.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+# Accepted metadata inputs: a single `VideoMetadata` or dict, a list of them, or a list of lists of them.
+VideoMetadataType = VideoMetadata | dict | list[dict | VideoMetadata] | list[list[dict | VideoMetadata]]
+def is_valid_video_frame(frame):
+    """
+    Return whether `frame` is a single video frame: a PIL image, or a numpy array / torch tensor with
+    exactly 3 dimensions.
+    """
+    # `PIL` is only imported at module level when vision support is available, so the PIL check
+    # must be skipped otherwise to avoid a `NameError`.
+    if is_vision_available() and isinstance(frame, PIL.Image.Image):
+        return True
+    return (is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3
+def is_valid_video(video):
+    """
+    Return whether `video` is a single video: a numpy array / torch tensor with exactly 4 dimensions,
+    or a non-empty list/tuple made only of valid video frames.
+    """
+    if not isinstance(video, (list, tuple)):
+        return (is_numpy_array(video) or is_torch_tensor(video)) and video.ndim == 4
+    # `bool(...)` so that an empty list/tuple yields `False` rather than the empty container itself.
+    return bool(video) and all(is_valid_video_frame(frame) for frame in video)
+def valid_videos(videos):
+    """
+    Return whether `videos` is acceptable video input.
+    A list or tuple passes when every element is either a valid video or a valid video frame (it may be
+    one video given as frames, or a batch of videos). Any other input passes when `is_valid_video`
+    accepts it and it does not have 5 dimensions.
+    """
+    if not isinstance(videos, (list, tuple)):
+        # Rejected when it is not a valid video, or when it has 5 dimensions
+        rejected = not is_valid_video(videos) or videos.ndim == 5
+        return not rejected
+    return all(is_valid_video(entry) or is_valid_video_frame(entry) for entry in videos)
+def is_batched_video(videos):
+    """
+    Return whether `videos` is a batch of videos: a list/tuple whose first element is a valid video, or a
+    numpy array / torch tensor with exactly 5 dimensions. Only the first element of a list is inspected.
+    """
+    if isinstance(videos, (list, tuple)):
+        # An empty list/tuple is not a batch; checking first avoids an `IndexError` on `videos[0]`.
+        return bool(videos) and bool(is_valid_video(videos[0]))
+    return bool((is_numpy_array(videos) or is_torch_tensor(videos)) and videos.ndim == 5)
+def is_scaled_video(video: np.ndarray) -> bool:
+    """
+    Tell whether every pixel value of `video` lies within [0, 1], i.e. whether it looks already rescaled.
+    """
+    # Only the value range is inspected, not the dtype: a floating-point video may still hold
+    # values in [0, 255].
+    lowest = np.min(video)
+    highest = np.max(video)
+    return lowest >= 0 and highest <= 1
+def convert_pil_frames_to_video(videos: list[VideoInput]) -> list[Union[np.ndarray, "torch.Tensor"]]:
+    """
+    Turn each video of a batch that is given as a list of image frames into one stacked array.
+    Only the first frame of the first video is inspected to decide whether a conversion is needed; all
+    other videos are assumed to share that format. When no conversion is needed, `videos` is returned as is.
+    Args:
+        videos (`VideoInput`):
+            Video inputs to turn into a list of videos.
+    """
+    head = videos[0]
+    if not isinstance(head, (list, tuple)) or not is_valid_image(head[0]):
+        return videos
+    # Each frame becomes an array, then the frames of a video are stacked along a new leading axis
+    return [np.stack([np.array(frame) for frame in frames]) for frames in videos]
+def make_batched_videos(videos) -> list[Union[np.ndarray, "torch.Tensor", "URL", "Path"]]:
+    """
+    Ensure that the input is a list of videos. If the input is a single video, it is converted to a list of length 1.
+    If the input is a batch of videos, it is converted to a list of 4D video arrays. Videos passed as list `PIL.Image`
+    frames are converted to 4D arrays.
+    We assume that all inputs in the list are in the same format, based on the type of the first element.
+    Args:
+        videos (`VideoInput`):
+            Video inputs to turn into a list of videos.
+    Returns:
+        A flat list of videos. Strings are passed through untouched.
+    Raises:
+        `ValueError`: If `videos` is none of the handled formats and is not a list.
+    """
+    # Early exit for deeply nested list of image frame paths. We shouldn't flatten them
+    # entirely: only the outermost level is removed, so each returned entry is a list of strings.
+    try:
+        if isinstance(videos[0][0], list) and isinstance(videos[0][0][0], str):
+            return [image_paths for sublist in videos for image_paths in sublist]
+    except (IndexError, TypeError):
+        # Input is not nested that deeply (or not indexable): fall through to the regular handling
+        pass
+    if is_batched_video(videos):
+        # `list(...)` also turns a 5D array/tensor or a tuple into a list of its entries
+        return convert_pil_frames_to_video(list(videos))
+    elif isinstance(videos, str) or is_valid_video(videos):
+        # A single video (or a single string): wrap it into a batch of one
+        return convert_pil_frames_to_video([videos])
+    # only one frame passed, thus we unsqueeze time dim
+    elif is_valid_image(videos):
+        # NOTE(review): `PIL` is only imported when vision support is available; presumably
+        # `is_valid_image` cannot be true for non-PIL inputs without it — confirm.
+        if isinstance(videos, PIL.Image.Image):
+            videos = np.array(videos)
+        return [videos[None, ...]]
+    elif not isinstance(videos, list):
+        raise ValueError(
+            f"Invalid video input. Expected either a list of video frames or an input of 4 or 5 dimensions, but got"
+            f" type {type(videos)}."
+        )
+    # Recursively flatten any nested structure
+    flat_videos_list = []
+    for item in videos:
+        if isinstance(item, str) or is_valid_video(item):
+            flat_videos_list.append(item)
+        elif isinstance(item, list) and item:
+            flat_videos_list.extend(make_batched_videos(item))
+        # Anything else (empty lists, unsupported objects) is silently skipped
+    flat_videos_list = convert_pil_frames_to_video(flat_videos_list)
+    return flat_videos_list
+def make_batched_metadata(videos: VideoInput, video_metadata: VideoMetadataType) -> list[VideoMetadata]:
+    """
+    Normalize `video_metadata` into a flat list with one entry per video.
+    When `video_metadata` is `None`, default metadata is built for every entry of `videos`: the frame count
+    and frame indices come from `len(video)`, and height/width are filled in only for valid videos.
+    Args:
+        videos (`VideoInput`):
+            The videos the metadata belongs to. Only read when `video_metadata` is `None`.
+        video_metadata (`VideoMetadataType`):
+            A single metadata object/dict, a list of them, a nested list of them, or `None`.
+    Returns:
+        `list[VideoMetadata]`: The batched metadata. A flat list whose first element is neither a list nor a
+        dict is returned unchanged.
+    """
+    if video_metadata is None:
+        # Nothing given: infer what we can from the videos themselves
+        inferred = []
+        for video in videos:
+            height, width = get_video_size(video) if is_valid_video(video) else (None, None)
+            frame_count = len(video)
+            inferred.append(
+                {
+                    "total_num_frames": frame_count,
+                    "fps": None,
+                    "duration": None,
+                    "frames_indices": list(range(frame_count)),
+                    "height": height,
+                    "width": width,
+                }
+            )
+        video_metadata = inferred
+    if not isinstance(video_metadata, list):
+        # A single object becomes a batch of one
+        return [VideoMetadata(**video_metadata)]
+    head = video_metadata[0]
+    if isinstance(head, list):
+        # Nested list: flatten one level while wrapping
+        return [VideoMetadata(**entry) for group in video_metadata for entry in group]
+    if isinstance(head, dict):
+        return [VideoMetadata(**entry) for entry in video_metadata]
+    return video_metadata
+def get_video_size(video: np.ndarray, channel_dim: ChannelDimension | None = None) -> tuple[int, int]:
+    """
+    Read the spatial size of `video` from its shape.
+    Args:
+        video (`np.ndarray`):
+            The video to get the dimensions of.
+        channel_dim (`ChannelDimension`, *optional*):
+            Where the channel axis sits. Inferred from the video (allowing 1, 3 or 4 channels) when `None`.
+    Returns:
+        `tuple[int, int]`: The `(height, width)` of the video.
+    Raises:
+        `ValueError`: If the channel dimension is neither `ChannelDimension.FIRST` nor `ChannelDimension.LAST`.
+    """
+    layout = channel_dim
+    if layout is None:
+        layout = infer_channel_dimension_format(video, num_channels=(1, 3, 4))
+    if layout == ChannelDimension.FIRST:
+        # Channels first: height and width are the two trailing axes
+        *_, height, width = video.shape
+        return height, width
+    if layout == ChannelDimension.LAST:
+        # Channels last: height and width sit just before the trailing channel axis
+        return video.shape[-3], video.shape[-2]
+    raise ValueError(f"Unsupported data format: {layout}")
+def get_uniform_frame_indices(total_num_frames: int, num_frames: int | None = None):
+    """
+    Build the indices of `num_frames` frames spread uniformly over a video of `total_num_frames` frames.
+    Args:
+        total_num_frames (`int`):
+            Total number of frames that a video has.
+        num_frames (`int`, *optional*):
+            Number of frames to sample uniformly. If not specified, all frames are sampled.
+    Returns:
+        np.ndarray: np array of frame indices that will be sampled.
+    """
+    if num_frames is None:
+        return np.arange(0, total_num_frames).astype(int)
+    # Fractional positions are generated first and truncated to integers afterwards
+    step = total_num_frames / num_frames
+    return np.arange(0, total_num_frames, step).astype(int)
+def default_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None, **kwargs):
+    """
+    Default sampling function: uniform sampling of `num_frames` frames, with `num_frames` optionally
+    derived from a target `fps`.
+    Args:
+        metadata (`VideoMetadata`):
+            `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
+        num_frames (`int`, *optional*):
+            Number of frames to sample uniformly. Takes priority over `fps` if both are provided.
+        fps (`int` or `float`, *optional*):
+            Desired frames per second. Only used when `num_frames` is `None`; at least one frame is sampled.
+    Returns:
+        `np.ndarray`: Array of frame indices to sample. All frames are returned when neither `num_frames`
+        nor `fps` is given.
+    Raises:
+        `ValueError`: If `fps` is requested but the metadata has no usable `fps`, or if the requested `fps`
+        needs more frames than the video has.
+    """
+    total_num_frames = metadata.total_num_frames
+    video_fps = metadata.fps
+    # If num_frames is not given but fps is, calculate num_frames from fps
+    if num_frames is None and fps is not None:
+        if not video_fps:
+            raise ValueError(
+                f"Cannot sample the video with fps={fps} because the video metadata has no valid `fps` "
+                f"(got {video_fps}). Pass `num_frames` instead."
+            )
+        # Clamp to one frame: a short video sampled at a low fps would otherwise yield
+        # `num_frames == 0` and a division by zero below.
+        num_frames = max(int(total_num_frames / video_fps * fps), 1)
+        if num_frames > total_num_frames:
+            raise ValueError(
+                f"When loading the video with fps={fps}, we computed num_frames={num_frames} "
+                f"which exceeds total_num_frames={total_num_frames}. Check fps or video metadata."
+            )
+    if num_frames is not None:
+        # NOTE(review): with `dtype=int` and a fractional step, `np.arange` uses a truncated step, so
+        # the result can differ from `np.arange(...).astype(int)` — confirm which one is intended.
+        indices = np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
+    else:
+        indices = np.arange(0, total_num_frames, dtype=int)
+    return indices
+def read_video_opencv(
+    video_path: Union["URL", "Path"],
+    sample_indices_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using the OpenCV backend.
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_indices_fn (`Callable`):
+            A callable that receives the video `metadata` (plus any extra `kwargs`) and returns the indices at
+            which the video should be sampled.
+            Example:
+            def sample_indices_fn(metadata, **kwargs):
+                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
+    Returns:
+        tuple[`np.ndarray`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import cv2
+    requires_backends(read_video_opencv, ["cv2"])
+    import cv2
+    video = cv2.VideoCapture(video_path)
+    # `try/finally` so the capture is released even if metadata creation, sampling or decoding fails
+    try:
+        total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        video_fps = video.get(cv2.CAP_PROP_FPS)
+        duration = total_num_frames / video_fps if video_fps else 0
+        metadata = VideoMetadata(
+            total_num_frames=int(total_num_frames),
+            fps=float(video_fps),
+            duration=float(duration),
+            video_backend="opencv",
+            height=int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+            width=int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
+        )
+        indices = sample_indices_fn(metadata=metadata, **kwargs)
+        # Set lookup instead of scanning `indices` once per decoded frame
+        wanted = set(np.asarray(indices).ravel().tolist())
+        last_wanted = max(wanted, default=-1)
+        index = 0
+        frames = []
+        # Frames are read sequentially; stop as soon as the last requested index has been passed
+        while video.isOpened() and index <= last_wanted:
+            success, frame = video.read()
+            if not success:
+                break
+            if index in wanted:
+                # OpenCV decodes to BGR, convert to RGB
+                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            index += 1
+            if index >= total_num_frames:
+                break
+    finally:
+        video.release()
+    metadata.frames_indices = indices
+    return np.stack(frames), metadata
+def read_video_decord(
+    video_path: Union["URL", "Path"],
+    sample_indices_fn: Callable,
+    **kwargs,
+):
+    """
+    Decode a video using the Decord backend.
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_indices_fn (`Callable`):
+            A callable that receives the video `metadata` (plus any extra `kwargs`) and returns the indices at
+            which the video should be sampled.
+            Example:
+            def sample_indices_fn(metadata, **kwargs):
+                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import from decord
+    requires_backends(read_video_decord, ["decord"])
+    from decord import VideoReader, cpu
+    reader = VideoReader(uri=video_path, ctx=cpu(0))  # decord has problems with gpu
+    source_fps = reader.get_avg_fps()
+    frame_count = len(reader)
+    duration = frame_count / source_fps if source_fps else 0
+    metadata = VideoMetadata(
+        total_num_frames=int(frame_count),
+        fps=float(source_fps),
+        duration=float(duration),
+        video_backend="decord",
+    )
+    indices = sample_indices_fn(metadata=metadata, **kwargs)
+    frames = reader.get_batch(indices).asnumpy()
+    # Height and width are only known once frames have been decoded
+    decoded_info = {
+        "frames_indices": indices,
+        "height": frames.shape[1],
+        "width": frames.shape[2],
+    }
+    metadata.update(decoded_info)
+    return frames, metadata
+def read_video_pyav(
+    video_path: Union["URL", "Path"],
+    sample_indices_fn: Callable,
+    **kwargs,
+):
+    """
+    Decode the video with PyAV decoder.
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_indices_fn (`Callable`):
+            A callable that receives the video `metadata` (plus any extra `kwargs`) and returns the indices at
+            which the video should be sampled. The last returned index is used as the point where decoding
+            stops, so indices are expected in increasing order.
+            Example:
+            def sample_indices_fn(metadata, **kwargs):
+                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import av
+    requires_backends(read_video_pyav, ["av"])
+    import av
+    # Context manager so the container is closed even when sampling or decoding fails
+    with av.open(video_path) as container:
+        stream = container.streams.video[0]
+        total_num_frames = stream.frames
+        video_fps = stream.average_rate  # should we better use `av_guess_frame_rate`?
+        duration = total_num_frames / video_fps if video_fps else 0
+        metadata = VideoMetadata(
+            total_num_frames=int(total_num_frames),
+            fps=float(video_fps),
+            duration=float(duration),
+            video_backend="pyav",
+            height=stream.height,
+            width=stream.width,
+        )
+        indices = sample_indices_fn(metadata=metadata, **kwargs)
+        # Set lookup instead of scanning `indices` once per decoded frame
+        wanted = set(np.asarray(indices).ravel().tolist())
+        end_index = indices[-1]
+        frames = []
+        container.seek(0)
+        for i, frame in enumerate(container.decode(video=0)):
+            if i > end_index:
+                break
+            if i in wanted:
+                frames.append(frame)
+        # Convert while the container is still open
+        video = np.stack([x.to_ndarray(format="rgb24") for x in frames])
+    metadata.frames_indices = indices
+    return video, metadata
+def read_video_torchvision(
+    video_path: Union["URL", "Path"],
+    sample_indices_fn: Callable,
+    **kwargs,
+):
+    """
+    Decode the video with torchvision decoder (deprecated, a warning is emitted on every call).
+    The whole video is read first and the sampled frames are selected afterwards.
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_indices_fn (`Callable`):
+            A callable that receives the video `metadata` (plus any extra `kwargs`) and returns the indices at
+            which the video should be sampled.
+            Example:
+            def sample_indices_fn(metadata, **kwargs):
+                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
+    Returns:
+        tuple[`torch.Tensor`, `VideoMetadata`]: A tuple containing:
+            - Torch tensor of the sampled frames, read with `output_format="TCHW"`.
+            - `VideoMetadata` object.
+    """
+    # `torchvision_io` is only imported at module level when both vision and torchvision are available;
+    # fail with the regular backend error instead of a `NameError`, like the other decoders do.
+    requires_backends(read_video_torchvision, ["vision", "torchvision"])
+    warnings.warn(
+        "Using `torchvision` for video decoding is deprecated and will be removed in future versions. "
+        "Please use `torchcodec` instead."
+    )
+    video, _, info = torchvision_io.read_video(
+        video_path,
+        start_pts=0.0,
+        end_pts=None,
+        pts_unit="sec",
+        output_format="TCHW",
+    )
+    video_fps = info["video_fps"]
+    total_num_frames = video.size(0)
+    duration = total_num_frames / video_fps if video_fps else 0
+    metadata = VideoMetadata(
+        total_num_frames=int(total_num_frames),
+        fps=float(video_fps),
+        duration=float(duration),
+        video_backend="torchvision",
+    )
+    indices = sample_indices_fn(metadata=metadata, **kwargs)
+    video = video[indices].contiguous()
+    # Frames are laid out as (T, C, H, W), hence height/width at positions 2 and 3
+    metadata.update(
+        {
+            "frames_indices": indices,
+            "height": video.shape[2],
+            "width": video.shape[3],
+        }
+    )
+    return video, metadata
+def read_video_torchcodec(
+    video_path: Union["URL", "Path"],
+    sample_indices_fn: Callable,
+    **kwargs,
+):
+    """
+    Decode the video with torchcodec decoder.
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_indices_fn (`Callable`):
+            A callable that receives the video `metadata` (plus any extra `kwargs`) and returns the indices at
+            which the video should be sampled.
+            Example:
+            def sample_indices_fn(metadata, **kwargs):
+                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
+        kwargs:
+            Forwarded to `sample_indices_fn`. A `device` entry, if present, is also used as the decoding
+            device (`"cpu"` when missing or `None`).
+    Returns:
+        Tuple[`torch.Tensor`, `VideoMetadata`]: A tuple containing:
+            - Torch tensor holding the sampled frames.
+            - `VideoMetadata` object.
+    """
+    # Lazy import torchcodec
+    requires_backends(read_video_torchcodec, ["torchcodec"])
+    from torchcodec.decoders import VideoDecoder
+    # VideoDecoder expects a string for device, default to "cpu" if None.
+    # `or` (rather than a `get` default) also covers an explicit `device=None`.
+    device = kwargs.get("device") or "cpu"
+    decoder = VideoDecoder(
+        video_path,
+        # Interestingly `exact` mode takes less than approximate when we load the whole video
+        seek_mode="exact",
+        # Allow FFmpeg decide on the number of threads for efficiency
+        num_ffmpeg_threads=0,
+        device=device,
+    )
+    total_num_frames = decoder.metadata.num_frames
+    video_fps = decoder.metadata.average_fps
+    metadata = VideoMetadata(
+        total_num_frames=total_num_frames,
+        fps=video_fps,
+        duration=decoder.metadata.duration_seconds,
+        video_backend="torchcodec",
+        height=decoder.metadata.height,
+        width=decoder.metadata.width,
+    )
+    indices = sample_indices_fn(metadata=metadata, **kwargs)
+    video = decoder.get_frames_at(indices=indices).data.contiguous()
+    metadata.frames_indices = indices
+    return video, metadata
+# Registry mapping each backend name to the function that decodes a video with that backend.
+# `load_video` looks up its `backend` argument in this mapping.
+VIDEO_DECODERS = {
+    "decord": read_video_decord,
+    "opencv": read_video_opencv,
+    "pyav": read_video_pyav,
+    "torchvision": read_video_torchvision,
+    "torchcodec": read_video_torchcodec,
+}
+def load_video(
+    video: VideoInput,
+    num_frames: int | None = None,
+    fps: int | float | None = None,
+    backend: str = "pyav",
+    sample_indices_fn: Callable | None = None,
+    **kwargs,
+) -> tuple:
+    """
+    Loads `video` and decodes the sampled frames with the requested backend.
+    Args:
+        video (`VideoInput`):
+            The video to load. Can be a link to a video or a local path. Anything that is not a string is
+            treated as already-decoded frames and returned as is.
+        num_frames (`int`, *optional*):
+            Number of frames to sample uniformly. If not passed, the whole video is loaded.
+        fps (`int` or `float`, *optional*):
+            Number of frames to sample per second. Should be passed only when `num_frames=None`.
+            If not specified and `num_frames==None`, all frames are sampled.
+        backend (`str`, *optional*, defaults to `"pyav"`):
+            The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision", "torchcodec"]. Defaults to "pyav".
+        sample_indices_fn (`Callable`, *optional*):
+            A callable function that will return indices at which the video should be sampled. If the video has to be
+            loaded using a different sampling technique than provided by the `num_frames` or `fps` arguments, one should
+            provide their own `sample_indices_fn`. If not provided, `default_sample_indices_fn` is called with
+            `num_frames` and `fps`; otherwise `sample_indices_fn` has priority over the other args.
+            The function receives the video metadata along with all kwargs passed to `load_video` and should output
+            valid indices at which the video should be sampled. For example:
+            Example:
+            def sample_indices_fn(metadata, **kwargs):
+                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
+    Returns:
+        tuple: `(video, metadata)` exactly as produced by the selected decoder. When `video` is not a string,
+        the input is returned unchanged together with a list of `None` of length `len(video)`.
+    Raises:
+        ValueError: If `num_frames` and `fps` are both set without a custom `sample_indices_fn`, or if a URL is
+            combined with the `"opencv"` backend.
+        ImportError: If the library behind `backend` (or `yt_dlp` for YouTube links) is not installed.
+        KeyError: If `backend` is not a key of `VIDEO_DECODERS`.
+        TypeError: If `video` is a string that is neither a URL nor an existing local file.
+        httpx.HTTPStatusError: If downloading the video from a URL returns an error status.
+    """
+    # If `sample_indices_fn` is given, we can accept any args as those might be needed by custom `sample_indices_fn`
+    if fps is not None and num_frames is not None and sample_indices_fn is None:
+        raise ValueError(
+            "`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!"
+        )
+    # If user didn't pass a sampling function, create one on the fly with default logic
+    if sample_indices_fn is None:
+        def sample_indices_fn_func(metadata, **fn_kwargs):
+            return default_sample_indices_fn(metadata, num_frames=num_frames, fps=fps, **fn_kwargs)
+        sample_indices_fn = sample_indices_fn_func
+    # Early exit if provided an array or `PIL` frames
+    if not isinstance(video, str):
+        metadata = [None] * len(video)
+        return video, metadata
+    # Validate the backend BEFORE touching the network, so a misconfiguration never costs a download.
+    video_is_url = video.startswith(("http://", "https://"))
+    if video_is_url and backend == "opencv":
+        raise ValueError("If you are trying to load a video from URL, you cannot use 'opencv' as backend")
+    if (
+        (not is_decord_available() and backend == "decord")
+        or (not is_av_available() and backend == "pyav")
+        or (not is_cv2_available() and backend == "opencv")
+        or (not is_torchvision_available() and backend == "torchvision")
+        or (not is_torchcodec_available() and backend == "torchcodec")
+    ):
+        raise ImportError(
+            f"You chose backend={backend} for loading the video but the required library is not found in your environment "
+            f"Make sure to install {backend} before loading the video."
+        )
+    # Unknown backend names surface here as a `KeyError`, again before any I/O happens
+    video_decoder = VIDEO_DECODERS[backend]
+    # Resolve the input string to something the decoder can read: an in-memory buffer or a local path
+    if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
+        if not is_yt_dlp_available():
+            raise ImportError("To load a video from YouTube url you have  to install `yt_dlp` first.")
+        # Lazy import from yt_dlp
+        requires_backends(load_video, ["yt_dlp"])
+        from yt_dlp import YoutubeDL
+        # NOTE(review): this captures whatever `YoutubeDL.download` writes to stdout into `buffer` and feeds
+        # it to the decoder - confirm the downloader really streams the video bytes to stdout here.
+        buffer = BytesIO()
+        with redirect_stdout(buffer), YoutubeDL() as f:
+            f.download([video])
+        bytes_obj = buffer.getvalue()
+        file_obj = BytesIO(bytes_obj)
+    elif video_is_url:
+        response = httpx.get(video, follow_redirects=True)
+        # Without this an HTTP error page would be handed to the decoder as if it were video bytes
+        response.raise_for_status()
+        file_obj = BytesIO(response.content)
+    elif os.path.isfile(video):
+        file_obj = video
+    else:
+        raise TypeError("Incorrect format used for video. Should be an url linking to an video or a local path.")
+    video, metadata = video_decoder(file_obj, sample_indices_fn, **kwargs)
+    return video, metadata
+def convert_to_rgb(
+    video: np.ndarray,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Convert video to RGB by blending the transparency layer if it's in RGBA format, otherwise simply returns it.
+    The video is first brought to channels-first layout and is returned in that layout.
+    Args:
+        video (`np.ndarray`):
+            The video to convert.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input video. If unset, will use the inferred format from the input.
+    Returns:
+        `np.ndarray`: The channels-first video with 3 channels. Grayscale input is repeated over 3 channels,
+        fully opaque RGBA input has its alpha channel dropped, and RGBA input with transparency is blended
+        onto a white background (the blended result is a floating point array).
+    Raises:
+        TypeError: If `video` is not a numpy array.
+    """
+    if not isinstance(video, np.ndarray):
+        raise TypeError(f"Video has to be a numpy array to convert to RGB format, but found {type(video)}")
+    # np.array usually comes with ChannelDimension.LAST so let's convert it
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(video)
+    video = to_channel_dimension_format(video, ChannelDimension.FIRST, input_channel_dim=input_data_format)
+    num_channels = video.shape[-3]
+    # 3 channels for RGB already
+    if num_channels == 3:
+        return video
+    # Grayscale video so we repeat it 3 times for each channel
+    if num_channels == 1:
+        return video.repeat(3, -3)
+    color = video[..., :3, :, :]
+    alpha_plane = video[..., 3, :, :]
+    # Fully opaque: nothing to blend, but the alpha channel still has to go to honour the RGB contract
+    if not (alpha_plane < 255).any():
+        return color
+    # There is a transparency layer, blend it with a white background.
+    # Calculate the alpha proportion for blending; keep a channel axis so it broadcasts over R, G and B.
+    alpha = alpha_plane[..., None, :, :] / 255.0
+    # Blend the *color* planes (not the alpha plane itself) with white
+    return (1 - alpha) * 255 + alpha * color
+def pad(
+    video: np.ndarray,
+    padding: int | tuple[int, int] | Iterable[tuple[int, int]],
+    mode: PaddingMode = PaddingMode.CONSTANT,
+    constant_values: float | Iterable[float] = 0.0,
+    data_format: str | ChannelDimension | None = None,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Pads the `video` with the specified (height, width) `padding` and `mode`.
+    Args:
+        video (`np.ndarray`):
+            The video to pad.
+        padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
+            Padding to apply to the edges of the height, width axes. Can be one of these formats:
+            - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
+            - `((before, after),)` or `(before, after)` yields same before and after pad for height and width.
+            - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
+        mode (`PaddingMode`):
+            The padding mode to use. Can be one of:
+                - `"constant"`: pads with a constant value.
+                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
+                  vector along each axis.
+                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
+                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
+        constant_values (`float` or `Iterable[float]`, *optional*):
+            The value to use for the padding if `mode` is `"constant"`. Accepts the same formats as `padding`.
+        data_format (`str` or `ChannelDimension`, *optional*):
+            The channel dimension format for the output video. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_frames, num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: video in (num_frames, height, width, num_channels) format.
+            If unset, will use same as the input video.
+        input_data_format (`str` or `ChannelDimension`, *optional*):
+            The channel dimension format for the input video. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_frames, num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: video in (num_frames, height, width, num_channels) format.
+            If unset, will use the inferred format of the input video.
+    Returns:
+        `np.ndarray`: The padded video.
+    Raises:
+        ValueError: If `mode` is not a supported `PaddingMode`, or if `padding` / `constant_values` use an
+            unsupported format.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(video)
+    def _expand_for_data_format(values):
+        """
+        Convert values to be in the format expected by np.pad based on the data format.
+        """
+        if isinstance(values, (int, float)):
+            values = ((values, values), (values, values))
+        elif isinstance(values, tuple) and len(values) == 1:
+            single = values[0]
+            # `((before, after),)` reuses the pair for both axes; `(pad,)` reuses the scalar for all four edges
+            values = (single, single) if isinstance(single, tuple) else ((single, single), (single, single))
+        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], (int, float)):
+            # floats are accepted as well so that e.g. `constant_values=(0.5, 0.5)` works
+            values = (values, values)
+        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple):
+            pass
+        else:
+            raise ValueError(f"Unsupported format: {values}")
+        # add 0 for the frame and channel dimensions, which are never padded
+        values = (
+            ((0, 0), (0, 0), *values) if input_data_format == ChannelDimension.FIRST else ((0, 0), *values, (0, 0))
+        )
+        # Add additional padding if there's a batch dimension; np.pad needs a (before, after) pair per axis
+        values = ((0, 0), *values) if video.ndim == 5 else values
+        return values
+    # Translate `PaddingMode` members into the mode names understood by `np.pad`
+    padding_map = {
+        PaddingMode.CONSTANT: "constant",
+        PaddingMode.REFLECT: "reflect",
+        # numpy calls edge replication "edge"; "replicate" is not a valid `np.pad` mode
+        PaddingMode.REPLICATE: "edge",
+        PaddingMode.SYMMETRIC: "symmetric",
+    }
+    if mode not in padding_map:
+        raise ValueError(f"Invalid padding mode: {mode}")
+    padding = _expand_for_data_format(padding)
+    pad_kwargs = {}
+    # `constant_values` is only a valid np.pad keyword in constant mode
+    if mode == PaddingMode.CONSTANT:
+        pad_kwargs["constant_values"] = _expand_for_data_format(constant_values)
+    video = np.pad(video, padding, mode=padding_map[mode], **pad_kwargs)
+    video = to_channel_dimension_format(video, data_format, input_data_format) if data_format is not None else video
+    return video
+def group_videos_by_shape(
+    videos: list["torch.Tensor"],
+) -> tuple[dict[tuple[int, int], "torch.Tensor"], dict[int, tuple[tuple[int, int], int]]]:
+    """
+    Bucket videos that share the same frame count and spatial size, stacking each bucket into one tensor.
+    Returns two dictionaries:
+        - one keyed by `(num_frames, height, width)` whose values are the videos of that key stacked along a new
+          leading dimension;
+        - one keyed by the position of each video in the input list, whose values are `(key, position_in_bucket)`.
+    """
+    buckets = {}
+    positions = {}
+    for original_idx, clip in enumerate(videos):
+        # frame count comes from the fourth-from-last axis, spatial size from the last two axes
+        key = (clip.shape[-4], *clip.shape[-2:])
+        bucket = buckets.setdefault(key, [])
+        # record where this clip will sit inside its bucket before appending it
+        positions[original_idx] = (key, len(bucket))
+        bucket.append(clip)
+    # clips in one bucket share frame count and size, so they can be stacked into a single tensor
+    stacked = {key: torch.stack(bucket, dim=0) for key, bucket in buckets.items()}
+    return stacked, positions
+def reorder_videos(
+    processed_videos: dict[tuple[int, int], "torch.Tensor"],
+    grouped_videos_index: dict[int, tuple[tuple[int, int], int]],
+) -> list["torch.Tensor"]:
+    """
+    Rebuild the list of videos in their original order.
+    For every original position `0 .. len(grouped_videos_index) - 1`, `grouped_videos_index` supplies the group
+    key and the position inside that group at which the video is found in `processed_videos`.
+    """
+    restored = []
+    for original_idx in range(len(grouped_videos_index)):
+        location = grouped_videos_index[original_idx]
+        group_key = location[0]
+        slot = location[1]
+        restored.append(processed_videos[group_key][slot])
+    return restored