Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

added_tokens.json +0 -0
config.json +38 -0
configuration_dream.py +86 -0
generate_from_llada.py +294 -0
generation_config.json +16 -0
generation_utils.py +706 -0
merges.txt +0 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_dream.py +1781 -0
special_tokens_map.json +55 -0
tokenization_dream.py +340 -0
tokenizer_config.json +0 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "architectures": [
+    "DreamModel"
+  ],
+  "attention_dropout": 0.0,
+  "audio_model_name_or_path": "/data/lijiang/code/Dream/22_local//cognitron_vl_magvit//cognitron_mm/models/dream",
+  "auto_map": {
+    "AutoConfig": "configuration_dream.DreamConfig",
+    "AutoModel": "modeling_dream.DreamModel",
+    "AutoModelForCausalLM": "modeling_dream.DreamModel"
+  },
+  "bos_token_id": 151643,
+  "chunk_size": -1,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "mask_token_id": 151666,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "Dream",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 176264
+}

configuration_dream.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dream model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class DreamConfig(PretrainedConfig):
+    model_type = "Dream"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=False,  # cache not used in diffusion
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        mask_token_id=151666,
+        pad_token_id=151643,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window if use_sliding_window else None
+        self.max_window_layers = max_window_layers
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.mask_token_id = mask_token_id
+        self.pad_token_id = pad_token_id

generate_from_llada.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import torch
+import numpy as np
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+def add_gumbel_noise(logits, temperature):
+    '''
+    The Gumbel max is a method for sampling categorical distributions.
+    According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
+    Thus, we use float64.
+    '''
+    if temperature == 0:
+        return logits
+    logits = logits.to(torch.float64)
+    noise = torch.rand_like(logits, dtype=torch.float64)
+    gumbel_noise = (- torch.log(noise)) ** temperature
+    return logits.exp() / gumbel_noise
+def get_num_transfer_tokens(mask_index, steps):
+    '''
+    In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
+    Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
+    the expected number of tokens transitioned at each step should be consistent.
+    This function is designed to precompute the number of tokens that need to be transitioned at each step.
+    '''
+    mask_num = mask_index.sum(dim=1, keepdim=True)
+    base = mask_num // steps
+    remainder = mask_num % steps
+    num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
+    for i in range(mask_num.size(0)):
+        num_transfer_tokens[i, :remainder[i]] += 1
+    return num_transfer_tokens
+def get_num_transfer_tokens_sch(mask_index, steps,schedule=None,schedule_kwargs=None):
+    '''
+    In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
+    Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
+    the expected number of tokens transitioned at each step should be consistent.
+    This function is designed to precompute the number of tokens that need to be transitioned at each step.
+    '''
+    if schedule is None:
+        return get_num_transfer_tokens(mask_index,steps)
+    if schedule_kwargs is None:
+        schedule_kwargs = {}
+    mask_num = mask_index.sum(dim=1, keepdim=True)
+    steps = int(min(steps,mask_num[0]))
+    t = torch.linspace(0, 1, steps+1)
+    # at least one sample per step
+    if schedule =='logit_normal':
+      sigmas = sigmoid_normal_cdf(t)
+    elif schedule =='shift':
+      sigmas = logit_normal_schedule(schedule_kwargs.get('shift',3),t)
+    elif schedule == 'cosine':
+        sigmas = cosine_schedule(t)
+    else:
+      sigmas = t
+    sigmas = sigmas.to(mask_num.device)
+    num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64)
+    for i in range(mask_num.size(0)):
+      # print(sigmas.shape)
+      sigmas_sample = (sigmas*mask_num[i]).to(torch.int64)
+      # print(sigmas_sample)
+      sigmas_sample = sigmas_sample[1:]-sigmas_sample[:-1]
+      # print(sigmas_sample)
+      # fix detal
+      sigmas_sample = torch.clamp(sigmas_sample,1,None) # should only increase
+      delta = sigmas_sample.sum() - mask_num[i]
+    #   breakpoint()
+      assert delta>=0
+      j = 0
+      while delta > 0:
+        j = j % len(sigmas_sample)
+        if sigmas_sample[j] == 1:
+          j += 1
+          continue
+        delta -= 1
+        sigmas_sample[j] -= 1
+        j += 1
+    #   breakpoint()
+      assert sigmas_sample.sum()==mask_num[i]
+      num_transfer_tokens[i] = sigmas_sample#.to(torch.int64)
+    return num_transfer_tokens.flip(-1)
+def linear(y):
+    return y
+def cosine_schedule(x):
+    """
+    Cosine schedule mapping [0, 1] -> [1, 0]
+    """
+    x = np.clip(x, 0, 1)
+    return 1-0.5 * (1 + np.cos(np.pi * x))
+def sigmoid_normal_cdf(y):
+    # y must be in (0, 1)
+    logit_y = torch.log(y / (1 - y))
+    return 0.5 * (1 + torch.erf(logit_y / torch.sqrt(torch.tensor(2.0))))
+def logit_normal_schedule(shift,sigmas):
+    # shift = 1 / shift
+    sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+    return sigmas
+import os
+# Debug switch for the per-block prints in `generate`. The raw environment string is
+# kept, so ANY non-empty value (including "0" or "false") is truthy; unset means False.
+DEBUG_PRINT_OUTPUT = os.environ.get('DEBUG_PRINT_OUTPUT',False)
+@ torch.no_grad()
+def generate(model, prompt=None, steps=None, max_new_tokens=128, block_length=128, temperature=0.,
+             cfg_scale=0., remasking='low_confidence', mask_id=126336,inputs_embeds=None, position_ids=None,attention_mask=None,
+              tokenizer=None,
+                verbose=False,
+                step_per_block=None,
+                prefix_lm=False,
+                schedule=None,
+                schedule_kwargs=None,
+                draft_tokens=None,
+                step_ratio=None,
+             **kwargs):
+    '''
+    Fill a masked answer region block by block, committing the most confident predictions at each step.
+    Args:
+        model: Mask predictor. It is called as ``model(None, input_embeddings=...)`` and must expose
+            ``model.transformer.wte`` (token embedding) and ``model.device``.
+        prompt: A tensor of shape (1, L). If None, ``inputs_embeds`` must be given and a zero-filled
+            placeholder of the same (batch, length) is used in its place.
+        steps: Ignored: it is overwritten with ``max_new_tokens`` at the top of the function.
+        max_new_tokens: Generated answer length (called ``gen_length`` inside the function).
+        block_length: Block length; must divide ``max_new_tokens``. If smaller, blocks are filled left to right.
+        temperature: Gumbel-noise temperature used when picking tokens; 0 means plain argmax.
+        cfg_scale: Classifier-free guidance scale; see the NOTE(review) on the ``cfg_scale > 0`` branch.
+        remasking: Confidence score used to pick which positions to commit:
+            'low_confidence', 'random', 'entrophy' (spelled as in the code) or 'margin'.
+        mask_id: Token id of [MASK]; the default is 126336.
+        inputs_embeds: Optional embeddings. Without ``prefix_lm`` they overwrite the first
+            ``inputs_embeds.shape[1]`` positions of the embedded sequence on every step; with
+            ``prefix_lm`` they are run once to build ``past_key_values``.
+        position_ids: Must be None (asserted).
+        attention_mask, tokenizer, **kwargs: Accepted but not read anywhere in this function.
+        verbose: If true, also return a list with a CPU copy of the sequence after every step.
+        step_per_block: If set, steps per block become ``min(step_per_block, block_length)``.
+        prefix_lm: Condition on a cached prefix instead of re-embedding the prompt each step.
+        schedule, schedule_kwargs: Forwarded to ``get_num_transfer_tokens_sch``.
+        draft_tokens: Optional (1, <= max_new_tokens) ids copied into the start of the answer region.
+        step_ratio: If set, multiplies the steps per block (truncated to int); exclusive with ``step_per_block``.
+    Returns:
+        The token tensor ``x`` (prompt followed by the answer region), or ``(x, history)`` when ``verbose``.
+    '''
+    # breakpoint()
+    # remasking =
+    # step_ratio = 0.5
+    # block_length = 1024
+    # steps = 1024
+    # The caller-supplied `steps` is discarded here: one step per new token before block splitting.
+    steps = max_new_tokens # min(steps,max_new_tokens)
+    # if step_ratio:
+    #     steps = int(max_new_tokens*step_ratio)
+    gen_length = max_new_tokens
+    assert position_ids is None
+    if prompt is None:
+        assert inputs_embeds is not None
+        bsz, seq_len = inputs_embeds.shape[:2]
+        # Placeholder ids; the real content comes from `inputs_embeds`.
+        prompt = torch.full((bsz, seq_len), 0, dtype=torch.long).to(model.device)
+    past_key_values = None
+    if prefix_lm:
+        past_key_values = model(None,input_embeddings=inputs_embeds,use_cache=True).attn_key_values
+        # breakpoint()
+        x = torch.full((1, gen_length), mask_id, dtype=torch.long).to(model.device)
+        # NOTE(review): `bsz` is only bound in the `prompt is None` branch above, so
+        # prefix_lm with an explicit prompt raises NameError here.
+        prompt = torch.full((bsz, 0), 0, dtype=torch.long).to(model.device)
+        # x[:, :prompt.shape[1]] = prompt.clone()
+    else:
+        # Prompt tokens followed by `gen_length` mask tokens.
+        x = torch.full((1, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(model.device)
+        x[:, :prompt.shape[1]] = prompt.clone()
+    prompt_index = (x != mask_id)
+    assert prompt.shape[0] == 1
+    if draft_tokens is not None:
+        assert draft_tokens.shape[1] <= gen_length
+        # Pre-fill the head of the answer region; only positions still equal to mask_id get predicted.
+        x[:, prompt.shape[1]:prompt.shape[1]+draft_tokens.shape[1]] = draft_tokens.clone()
+    # if block_length < gen_length:
+    #    block_length = gen_length
+    assert gen_length % block_length == 0
+    num_blocks = gen_length // block_length
+    assert ( steps % num_blocks == 0) or step_per_block is not None
+    # From here on `steps` means steps per block.
+    steps = steps // num_blocks
+    if step_per_block:
+        steps = min(step_per_block,block_length)
+        assert step_ratio is None, 'Please do not pass both step_ratio and step_per_block'
+    # step_ratio = 0.5
+    # schedule = 'shift'
+    # schedule_kwargs = dict(shift=3)
+    # breakpoint()
+    if step_ratio:
+        steps = int(steps*step_ratio)
+    # print(steps,step_per_block,block_length,draft_tokens.shape[-1])
+    # NFE = 0
+    if verbose:
+        history = []
+    for num_block in range(num_blocks):
+        # Masked positions inside the current block only.
+        block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length:] == mask_id)
+        # NOTE(review): with a schedule, the helper may return fewer than `steps` columns
+        # (it caps steps by the masked count), while the loop below indexes column `i`
+        # up to `steps - 1` — confirm this cannot go out of range.
+        num_transfer_tokens = get_num_transfer_tokens_sch(block_mask_index, steps,schedule=schedule,schedule_kwargs=schedule_kwargs)
+        if DEBUG_PRINT_OUTPUT:
+            print(f"Block: {num_block + 1}/{num_blocks}, Steps per Block: {steps}, Block Length: {block_length}")
+            print(f"Tokens generated per step {num_transfer_tokens[0]}")
+        for i in range(steps):
+            # print(i)
+            mask_index = (x == mask_id)
+            # print(mask_index.sum())
+            # Nothing left to predict anywhere in the sequence: skip the model call.
+            if mask_index.sum() == 0:
+                continue
+            # NFE += 2
+            if cfg_scale > 0.:
+                # NOTE(review): this asserts a (truthy) exception instance, so it never
+                # fails and the guidance code below still runs; `raise` was probably intended.
+                assert NotImplementedError('cfg_scale > 0. is not supported.')
+                un_x = x.clone()
+                un_x[prompt_index] = mask_id
+                x_ = torch.cat([x, un_x], dim=0)
+                #
+                logits = model(x_,input_embeds_inference=[inputs_embeds,None]).logits
+                logits, un_logits = torch.chunk(logits, 2, dim=0)
+                logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
+            else:
+                inputs_embeds_curr = model.transformer.wte(x)
+                #print(tokenizer.batch_decode(x)[0].replace('<|endoftext|>',''))
+                # print((x==mask_id).sum())
+                # breakpoint()
+                if prefix_lm:
+                    # breakpoint()
+                    logits = model(None,input_embeddings=inputs_embeds_curr,past_key_values=past_key_values).logits
+                else:
+                    if inputs_embeds is not None:
+                        # Replace the placeholder prompt embeddings with the supplied ones.
+                        inputs_embeds_curr[:,:inputs_embeds.shape[1]] = inputs_embeds
+                    logits = model(None,input_embeddings=inputs_embeds_curr).logits
+            # logits = logits.cpu()
+            logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
+            # Candidate token for every position.
+            x0 = torch.argmax(logits_with_noise, dim=-1) # b, l
+            # torch.cuda.empty_cache()
+            # torch.cuda.synchronize()
+            # Per-position confidence `x0_p`; larger values are committed first.
+            if remasking == 'low_confidence':
+                # Probability the model assigns to the chosen token.
+                p = F.softmax(logits.to(torch.float64), dim=-1)
+                x0_p = torch.squeeze(
+                    torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
+            elif remasking == 'random':
+                x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
+            elif remasking == 'entrophy':
+                # sum(p * log p), i.e. the negative entropy; epsilon avoids log(0).
+                epsilon = 1e-10
+                probs = F.softmax(logits.to(torch.float64), dim=-1)
+                log_probs = torch.log(probs + epsilon)
+                x0_p = torch.sum(probs * log_probs, dim=-1)
+            elif remasking == 'margin':
+                ## similar to margin algo in Dream
+                # Gap between the two largest probabilities.
+                p = F.softmax(logits.to(torch.float64), dim=-1)
+                sorted_probs, _ = torch.sort(p, dim=-1, descending=True)
+                top1_probs = sorted_probs[:, :, 0]
+                top2_probs = sorted_probs[:, :, 1]
+                x0_p = top1_probs - top2_probs
+            else:
+                raise NotImplementedError(remasking)
+            # Positions after the current block can never be selected in this block.
+            x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf
+            # Already-known positions keep their token and get -inf confidence.
+            x0 = torch.where(mask_index, x0, x)
+            confidence = torch.where(mask_index, x0_p, -np.inf)
+            transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
+            for j in range(confidence.shape[0]):
+                # Commit the scheduled number of highest-confidence positions for this step.
+                _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
+                transfer_index[j, select_index] = True
+            x[transfer_index] = x0[transfer_index]
+            if verbose:
+                history.append(x.clone().cpu())
+    # breakpoint()
+    # print(f"NFE: {NFE} Num Blocks: {num_blocks}")
+    if verbose:
+        return x,history
+    return x
+def main():
+    device = 'cuda'
+    model = AutoModel.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True, torch_dtype=torch.bfloat16).to(device).eval()
+    tokenizer = AutoTokenizer.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True)
+    prompt = "Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours?"
+    # Add special tokens for the Instruct model. The Base model does not require the following two lines.
+    m = [{"role": "user", "content": prompt}, ]
+    prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
+    input_ids = tokenizer(prompt)['input_ids']
+    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)
+    out = generate(model, input_ids, steps=128, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
+    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
+    generate(model, input_ids, steps=128, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
+if __name__ == '__main__':
+    main()

generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_from_model_config": true,
+  "alg": "origin",
+  "alg_temp": null,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "eps": 0.001,
+  "mask_token_id": null,
+  "output_history": false,
+  "pad_token_id": 151643,
+  "steps": 512,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "transformers_version": "4.51.3"
+}

generation_utils.py ADDED Viewed

	@@ -0,0 +1,706 @@

+# coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+import copy
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+import torch
+import torch.distributions as dists
+from torch.nn import functional as F
+from transformers import __version__
+from transformers.generation.configuration_utils import (
+    GenerationConfig
+)
+from transformers.utils import (
+    ModelOutput,
+    is_torchdynamo_compiling,
+    logging,
+)
+from .generate_from_llada import  get_num_transfer_tokens_sch
+logger = logging.get_logger(__name__)
+import sys
+import pdb
+class ForkedPdb(pdb.Pdb):
+    """
+    PDB Subclass for debugging multi-processed code
+    Suggested in: https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess
+    """
+    def interaction(self, *args, **kwargs):
+        _stdin = sys.stdin
+        try:
+            sys.stdin = open('/dev/stdin')
+            pdb.Pdb.interaction(self, *args, **kwargs)
+        finally:
+            sys.stdin = _stdin
+def top_p_logits(logits, top_p=None):
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+    sorted_indices_to_remove = cumulative_probs > top_p
+    # Shift the indices to the right to keep the first token above the threshold
+    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+    sorted_indices_to_remove[..., 0] = 0
+    mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
+    mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
+    logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
+    return logits
+def top_k_logits(logits, top_k=None):
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    # Remove all tokens with a probability less than the last token of the top-k
+    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+    logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
+    return logits
+# def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
+#     if temperature > 0:
+#         logits = logits / temperature
+#     if top_p is not None and top_p < 1:
+#         logits = top_p_logits(logits, top_p)
+#     if top_k is not None:
+#         logits = top_k_logits(logits, top_k)
+#     probs = torch.softmax(logits, dim=-1)
+#     if temperature > 0:
+#         try:
+#             x0 = dists.Categorical(probs=probs).sample()
+#             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
+#         except:
+#             confidence, x0 = probs.max(dim=-1)
+#     else:
+#         confidence, x0 = probs.max(dim=-1)
+#     if margin_confidence:
+#         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
+#         # Extract top1 and top2 probabilities
+#         top1_probs = sorted_probs[:, 0]
+#         top2_probs = sorted_probs[:, 1]
+#         # Calculate confidence as top1 - top2
+#         confidence = top1_probs - top2_probs
+#     if neg_entropy:
+#         epsilon = 1e-10
+#         log_probs = torch.log(probs + epsilon)
+#         confidence = torch.sum(probs * log_probs, dim=-1)
+#     return confidence, x0
+def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
+    """
+    从给定的 logits 中采样或贪心选取 token，并返回置信度和 token ID。
+    参数：
+        logits (Tensor)：形状 [batch_size, vocab_size]，模型对各候选 token 的打分（未经 softmax）。
+        temperature (float)：温度系数，默认 0.0。>0 时按概率采样，=0 时贪心选取。
+        top_p (float 或 None)：核采样参数（nucleus sampling），若指定且 <1，只保留累计概率前 top_p 的 token。
+        top_k (int 或 None)：前 k 采样参数（top-k sampling），若指定，只从概率最高的 k 个 token 中选取。
+        margin_confidence (bool)：是否使用 top1−top2 之差作为置信度，默认 False。
+        neg_entropy (bool)：是否使用负熵（−∑p·logp）作为置信度，默认 False。
+    返回：
+        confidence (Tensor)：形状 [batch_size] 的置信度值（可用概率、margin 差值或负熵）。
+        x0 (Tensor)：形状 [batch_size] 的 int64 张量，表示采样或贪心得到的 token ID。
+    """
+    # ======================================================
+    # 1. 温度缩放 (Temperature Scaling)
+    # ======================================================
+    if temperature > 0:
+        # 当 temperature>0 时，将 logits 除以 temperature，使得 softmax 分布更平滑或更尖锐
+        logits = logits / temperature
+    # ======================================================
+    # 2. Top-p (Nucleus) 与 Top-k 过滤
+    # ======================================================
+    if top_p is not None and top_p < 1:
+        # 调用 top_p_logits，保留累计概率达到 top_p 的 token，其它 logits 置为很小的负值
+        logits = top_p_logits(logits, top_p)
+    if top_k is not None:
+        # 调用 top_k_logits，仅保留概率最高的 top_k 个 token，其它 logits 置为很小的负值
+        logits = top_k_logits(logits, top_k)
+    # ======================================================
+    # 3. 计算概率分布 (Softmax)
+    # ======================================================
+    probs = torch.softmax(logits, dim=-1)
+    # 此时 probs 形状为 [batch_size, vocab_size]，每行和为 1
+    # ======================================================
+    # 4. 根据 temperature 决定采样或贪心选取
+    # ======================================================
+    if temperature > 0:
+        # 随机采样分支：从 Categorical 分布中采样 token
+        try:
+            # 从多项分布中采样得到 token ID，形状 [batch_size]
+            x0 = dists.Categorical(probs=probs).sample()
+            # 用 gather 取出对应位置的概率值作为置信度，形状 [batch_size]
+            confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
+        except:
+            # 若采样出错（如概率分布不合法），退化为贪心选取
+            confidence, x0 = probs.max(dim=-1)
+    else:
+        # 当 temperature=0 时，直接贪心选取概率最大的 token
+        confidence, x0 = probs.max(dim=-1)
+    # ======================================================
+    # 5. margin_confidence: 使用 top1−top2 差值作为置信度
+    # ======================================================
+    if margin_confidence:
+        # 将每行概率按降序排序，sorted_probs[:,0] 为 top1，sorted_probs[:,1] 为 top2
+        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
+        top1_probs = sorted_probs[:, 0]
+        top2_probs = sorted_probs[:, 1]
+        # 置信度设为 top1_probs − top2_probs
+        confidence = top1_probs - top2_probs
+    # ======================================================
+    # 6. neg_entropy: 使用负熵（−∑ p·log p）作为置信度
+    # ======================================================
+    if neg_entropy:
+        epsilon = 1e-10
+        # 为避免 log(0) 产生 −inf，加上一个小常数 epsilon
+        log_probs = torch.log(probs + epsilon)
+        # 计算 ∑ p_i * log p_i，结果是负熵值（值越接近 0，表示分布更“尖锐”）
+        confidence = torch.sum(probs * log_probs, dim=-1)
+    return confidence, x0
+@dataclass
+class DreamModelOutput(ModelOutput):
+    """Output container with a ``sequences`` tensor and an optional ``history`` tuple."""
+    # Presumably the generated token ids — the code that fills this is outside this view; confirm.
+    sequences: torch.LongTensor = None
+    # Presumably the intermediate states recorded when `output_history` is enabled — confirm against the caller.
+    history: Optional[Tuple[torch.FloatTensor]] = None
+class DreamGenerationConfig(GenerationConfig):
+    def __init__(self, **kwargs):
+        self.temperature: float = kwargs.pop("temperature", 0.0)
+        self.top_p: Optional[float] = kwargs.pop("top_p", None)
+        self.top_k: Optional[int] = kwargs.pop("top_k", None)
+        self.max_length = kwargs.pop("max_length", 20)
+        self.max_new_tokens = kwargs.pop("max_new_tokens", None)
+        # diffusion specific params
+        self.eps: float = kwargs.pop("eps", 1e-3)
+        self.steps: int = kwargs.pop("steps", 512)
+        self.alg: str = kwargs.pop("alg", 'origin')
+        self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
+        # Parameters that define the output variables of `generate`
+        self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
+        self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
+        self.output_history: bool = kwargs.pop("output_history", False)
+        # Special tokens that can be used at generation time
+        self.mask_token_id = kwargs.pop("mask_token_id", None)
+        self.pad_token_id = kwargs.pop("pad_token_id", None)
+        self.bos_token_id = kwargs.pop("bos_token_id", None)
+        self.eos_token_id = kwargs.pop("eos_token_id", None)
+        # Wild card
+        self.generation_kwargs = kwargs.pop("generation_kwargs", {})
+        # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub
+        # interface.
+        self._from_model_config = kwargs.pop("_from_model_config", False)
+        self._commit_hash = kwargs.pop("_commit_hash", None)
+        self.transformers_version = kwargs.pop("transformers_version", __version__)
+        # Additional attributes without default values
+        if not self._from_model_config:
+            # we don't want to copy values from the model config if we're initializing a `GenerationConfig` from a
+            # model's default configuration file
+            for key, value in kwargs.items():
+                try:
+                    setattr(self, key, value)
+                except AttributeError as err:
+                    logger.error(f"Can't set {key} with value {value} for {self}")
+                    raise err
+        # Validate the values of the attributes
+        self.validate(is_init=True)
+    def validate(self, is_init=False):
+        pass
+class DreamGenerationMixin:
+    @staticmethod
+    def _expand_inputs_for_generation(
+        expand_size: int = 1,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None
+    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
+        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
+        # Do not call torch.repeat_interleave if expand_size is 1 because it clones
+        # the input tensor and thus requires more memory although no change is applied
+        if expand_size == 1:
+            return input_ids, attention_mask
+        if input_ids is not None:
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+        if attention_mask is not None:
+            attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
+        return input_ids, attention_mask
+    def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
+        """Performs validation related to the resulting generated length"""
+        # Can't throw warnings/exceptions during compilation
+        if is_torchdynamo_compiling():
+            return
+        # 1. Max length warnings related to poor parameterization
+        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
+            # 20 is the default max_length of the generation config
+            warnings.warn(
+                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
+                "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
+                "generation.",
+                UserWarning,
+            )
+        if input_ids_length >= generation_config.max_length:
+            input_ids_string = "input_ids"
+            raise ValueError(
+                f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                " increasing `max_length` or, better yet, setting `max_new_tokens`."
+            )
+    def _prepare_generated_length(
+        self,
+        generation_config,
+        has_default_max_length,
+        input_ids_length,
+    ):
+        """Prepared max and min length in generation configs to avoid clashes between similar attributes"""
+        if generation_config.max_new_tokens is not None:
+            if not has_default_max_length and generation_config.max_length is not None:
+                logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_length
+        elif has_default_max_length:
+            if generation_config.max_length == DreamGenerationConfig().max_length:
+                generation_config.max_length = generation_config.max_length + input_ids_length
+                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+                if max_position_embeddings is not None:
+                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
+        return generation_config
+    def _prepare_generation_config(
+        self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict
+    ) -> DreamGenerationConfig:
+        """
+        Prepares the base generation config, then applies any generation configuration options from kwargs. This
+        function handles retrocompatibility with respect to configuration files.
+        """
+        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
+        using_model_generation_config = False
+        if generation_config is None:
+            generation_config = DreamGenerationConfig.from_model_config(self.config)
+            using_model_generation_config = True
+        # `torch.compile` can't compile `copy.deepcopy`, arguments in `kwargs` that are part of `generation_config`
+        # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled -- an
+        # exception will be raised in `_validate_model_kwargs`
+        if not is_torchdynamo_compiling():
+            generation_config = copy.deepcopy(generation_config)
+            _kwargs = generation_config.update(**kwargs)
+            # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
+            if not using_model_generation_config:
+                if generation_config.bos_token_id is None:
+                    generation_config.bos_token_id = self.generation_config.bos_token_id
+                if generation_config.eos_token_id is None:
+                    generation_config.eos_token_id = self.generation_config.eos_token_id
+                if generation_config.pad_token_id is None:
+                    generation_config.pad_token_id = self.generation_config.pad_token_id
+                if generation_config.mask_token_id is None:
+                    generation_config.mask_token_id = self.generation_config.mask_token_id
+        return generation_config
+    def _prepare_special_tokens(
+        self,
+        generation_config: DreamGenerationConfig,
+        device: Optional[Union[torch.device, str]] = None,
+    ):
+        """
+        Prepares the special tokens for generation, overwriting the generation config with their processed versions
+        converted to tensor.
+        Note that `generation_config` is changed in place and stops being serializable after this method is called.
+        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
+        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
+        """
+        # Convert special tokens to tensors
+        def _tensor_or_none(token, device=None):
+            if token is None:
+                return token
+            device = device if device is not None else self.device
+            if isinstance(token, torch.Tensor):
+                return token.to(device)
+            return torch.tensor(token, device=device, dtype=torch.long)
+        bos_token_tensor = _tensor_or_none(generation_config.bos_token_id, device=device)
+        eos_token_tensor = _tensor_or_none(generation_config.eos_token_id, device=device)
+        pad_token_tensor = _tensor_or_none(generation_config.pad_token_id, device=device)
+        mask_token_tensor = _tensor_or_none(generation_config.mask_token_id, device=device)
+        # We can have more than one eos token. Always treat it as a 1D tensor (when it exists).
+        if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
+            eos_token_tensor = eos_token_tensor.unsqueeze(0)
+        # Set pad token if unset (and there are conditions to do so)
+        if pad_token_tensor is None and eos_token_tensor is not None:
+            pad_token_tensor = eos_token_tensor[0]
+            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")
+        # Update generation config with the updated special tokens tensors
+        # NOTE: this must be written into a different attribute name than the one holding the original special tokens
+        # (in their non-tensor form), in order to enable end-to-end compilation. See
+        # https://pytorch.org/docs/stable/torch.compiler_cudagraph_trees.html#limitations
+        generation_config._bos_token_tensor = bos_token_tensor
+        generation_config._eos_token_tensor = eos_token_tensor
+        generation_config._pad_token_tensor = pad_token_tensor
+        generation_config._mask_token_tensor = mask_token_tensor
+    @torch.no_grad()
+    def diffusion_generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[DreamGenerationConfig] = None,
+        inputs_embeds=None,
+        prefix_lm=False,
+        **kwargs,
+    ) -> Union[DreamModelOutput, torch.LongTensor]:
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+        generation_config = self._prepare_generation_config(generation_config, **kwargs)
+        generation_tokens_hook_func = kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
+        generation_logits_hook_func = kwargs.pop("generation_logits_hook_func", lambda step, x, logits: logits)
+        # breakpoint()
+        # 2. Define model inputs
+        # import pdb;pdb.set_trace()
+        if inputs is not None:
+            input_ids = inputs
+            device = input_ids.device
+            input_ids_length = input_ids.shape[-1]
+        else:
+            input_ids = None
+            device = inputs_embeds.device
+            input_ids_length = inputs_embeds.shape[1]
+        attention_mask = kwargs.pop("attention_mask", None)
+        self._prepare_special_tokens(generation_config, device=device)
+        # 3. Prepare `max_length`.
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            input_ids_length=input_ids_length,
+        )
+        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
+        # import pdb;pdb.set_trace()
+        # 4. Check input_ids
+        #if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
+        if not is_torchdynamo_compiling() and self.device.type != device.type:
+            warnings.warn(
+                "You are calling .generate() with the `input_ids` being on a device type different"
+                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
+                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
+                " Please make sure that you have put `input_ids` to the"
+                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
+                " running `.generate()`.",
+                UserWarning,
+            )
+        # breakpoint()
+        if (
+            hasattr(generation_config, "pad_token_id") and
+            input_ids is not None and
+            torch.any(input_ids == generation_config.pad_token_id) and
+            attention_mask is None
+        ):
+            warnings.warn(
+                "Padding was detected but no attention mask is passed here. For correct "
+                "generation results, please set `attention_mask` when batch-padding inputs.",
+                UserWarning,
+            )
+        assert generation_config.num_return_sequences == 1, "Currently, we only support num_return_sequences = 1 for diffusion generation."
+        # import pdb;pdb.set_trace()
+        input_ids, attention_mask = self._expand_inputs_for_generation(
+            expand_size=generation_config.num_return_sequences,
+            input_ids=input_ids,
+            attention_mask=attention_mask
+        )
+        result = self._sample(
+            input_ids,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            generation_tokens_hook_func=generation_tokens_hook_func,
+            generation_logits_hook_func=generation_logits_hook_func,
+            inputs_embeds=inputs_embeds,
+            device=device,
+            prefix_lm=prefix_lm,
+            **kwargs,
+        )
+        return result
+    def _sample(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.LongTensor],
+        generation_config: DreamGenerationConfig,
+        generation_tokens_hook_func,
+        generation_logits_hook_func,
+        inputs_embeds=None,
+        prefix_lm=False,
+        device=None,
+        schedule_kwargs=None,
+        schedule=None,
+        step_ratio=None,
+        **kwargs,
+    ) -> Union[DreamModelOutput, torch.LongTensor]:
+        # 1. 从 generation_config 中提取常用参数
+        output_history = generation_config.output_history            # 是否保存每一步的中间结果
+        # output_history = True
+        return_dict_in_generate = generation_config.return_dict_in_generate  # 生成时是否返回字典形式
+        max_length = generation_config.max_length                     # 生成后序列的最大长度（包括前缀）
+        mask_token_id = generation_config.mask_token_id               # [MASK] 的 token ID
+        max_new_tokens = generation_config.max_new_tokens             # 最多新增的 token 数量
+        steps = min(generation_config.steps, max_new_tokens)          # 实际去噪步数，不能超过最大新增 token 数
+        eps = generation_config.eps                                    # 噪声下限，用于时刻表
+        alg = generation_config.alg                                    # 选择的去噪算法（'origin'/ 'maskgit_plus'/ 'topk_margin'/ 'entropy'）
+        alg_temp = generation_config.alg_temp                          # 针对某些算法（margin/entropy）调整置信度的温度参数
+        temperature = generation_config.temperature                    # 采样时的温度
+        top_p = generation_config.top_p                                # top-p 截断采样参数
+        top_k = generation_config.top_k                                # top-k 截断采样参数
+        # histories 用于保存每一步的 x，如果需要返回历史则初始化为列表，否则为 None
+        histories = [] if (return_dict_in_generate and output_history) else None
+        # 2. 如果没有传入 input_ids，而是直接传了 inputs_embeds，就根据 inputs_embeds 构造一个 placeholder 的 input_ids
+        if input_ids is None:
+            assert device is not None
+            assert inputs_embeds is not None
+            bsz, seq_len = inputs_embeds.shape[:2]                   # batch size 和前缀长度
+            max_length = seq_len + max_new_tokens                     # 重新计算 max_length
+            # 创建一个全 0 的张量作为占位，后续会把 embedding 覆盖回去
+            input_ids = torch.full((bsz, seq_len), 0, dtype=torch.long).to(device)
+        # tok_idx 和 past_key_values 暂时留空，后面 prefix_lm 分支会用到
+        tok_idx = None
+        past_key_values = None
+        # 3. 把 input_ids pad 到 max_length，后面补 [MASK]
+        #    F.pad 的 (0, L) 表示在右侧 pad 长度为 (max_length - seq_len)，值为 mask_token_id
+        # import pdb;pdb.set_trace()
+        x = F.pad(input_ids, (0, max_length - input_ids.shape[1]), value=mask_token_id)  # 生成初始的 […, MASK, MASK, …]
+        # 4. 如果启用 prefix_lm 模式，先用 inputs_embeds 做一次常规模型前缀推理，得到 past_key_values 和首个 token
+        if prefix_lm:
+            dtype = inputs_embeds.dtype
+            # 先做一次前缀推理，use_cache=True 以获取 past_key_values
+            prefill = self.forward_dream(
+                None, attention_mask, tok_idx,
+                inputs_embeds=inputs_embeds.to(dtype),
+                use_cache=True
+            )
+            past_key_values = prefill.past_key_values
+            # 把前缀阶段模型最后一步的预测 token 取出，作为去噪的第一个位置
+            first_token = prefill.logits[:, -1:].argmax(dim=-1)  # 形状为 [B, 1]
+            # 只保留 mask 区域（原 x 的 right half）
+            x = x[:, input_ids.shape[1]:]                       # 形状 [B, max_new_tokens]
+            # 把 mask 区域第一位填为 first_token
+            x[:, :1] = first_token
+        #. prefill['logits'].shape.  torch.Size([1, 1063, 151667]) 即输入是这个
+        # 5. 当前不支持带 attention_mask 的情形，断言确保 attention_mask 一定为 None
+        assert attention_mask is None
+        # 6. 构造去噪时刻表 timesteps，线性从 1 -> eps，共 (steps + 1) 个值
+        #    timesteps[i] 对应上一步噪声权重，timesteps[i+1] 对应本步噪声权重
+        timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
+        # import pdb;pdb.set_trace()
+        # 7. 给用户一个机会在第 0 步“初始 x”阶段插入自定义逻辑
+        x = generation_tokens_hook_func(None, x, None)
+        # 8. 如果用户指定 step_ratio，就根据比例重计算步数
+        if step_ratio is not None:
+            steps = int(max_new_tokens * step_ratio)
+        # 9. 计算每一步要去噪多少个 mask（如果传了 schedule，就用自定义调度）
+        if schedule is None:
+            sch = None
+        else:
+            # get_num_transfer_tokens_sch 返回形状 [B, steps] 的矩阵
+            sch = get_num_transfer_tokens_sch((x == mask_token_id), steps, schedule, schedule_kwargs)
+        # 10. 进入去噪主循环
+        for i in range(steps):
+            # 10.1 找出当前仍是 [MASK] 的位置，mask_index 为布尔矩阵 [B, current_length]
+            mask_index = (x == mask_token_id)
+            # 10.2 先把 x 转成 embedding，得到形状 [B, current_length, D]
+            inputs_embeds_curr = self.model.embed_tokens(x)
+            # 10.3 如果非 prefix_lm，且外部传入了 inputs_embeds，则把前缀部分覆盖回去
+            if not prefix_lm:
+                if inputs_embeds is not None:
+                    inputs_embeds_curr[:, :inputs_embeds.shape[1]] = inputs_embeds
+                # 用当前 embedding 做一次前向，得到 logits，形状 [B, current_length, V]
+                logits = self.forward_dream(None, attention_mask, tok_idx, inputs_embeds=inputs_embeds_curr).logits
+                # 把 logits 拼接成对齐当前预测：logits[:,1:] 对齐到 x[:, :-1]
+                logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
+            else:
+                # prefix_lm 模式，用 past_key_values 加速推理
+                logits = self.forward_dream(
+                    None, attention_mask, tok_idx,
+                    inputs_embeds=inputs_embeds_curr,
+                    past_key_values=past_key_values
+                ).logits
+                logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
+            # 10.4 用户自定义 logits 钩子，可以修改 logits 分布
+            # import pdb;pdb.set_trace()
+            logits = generation_logits_hook_func(i, x, logits)
+            # 10.5 取出当前所有 [MASK] 位置对应的 logits，形状 [num_mask, V]
+            mask_logits = logits[mask_index]
+            # 10.6 从 timesteps 中取出噪声权重 t, s
+            t = timesteps[i]
+            s = timesteps[i + 1]
+            # 10.7 根据不同算法决定本轮去噪逻辑
+            if alg == 'origin':
+                # 基础扩散算法：按概率 p_transfer 随机把一部分 mask 位置替换成 token
+                p_transfer = 1 - s / t if i < steps - 1 else 1  # 最后一轮保证把所有剩余 mask 都去掉
+                # x0 临时占位，全填 mask
+                x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
+                # 随机采样哪些位置在本轮去噪：如果 torch.rand < p_transfer 就先去噪
+                transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
+                # 对这些选中的位置，从 mask_logits 中采样真实 token
+                _, x0[transfer_index_t_s] = sample_tokens(
+                    mask_logits[transfer_index_t_s],
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k
+                )
+                # 更新 x：只替换 mask_index 位置
+                x[mask_index] = x0.clone()
+            else:
+                # MaskGIT+ / Top-K Margin / Entropy 算法
+                if alg == 'maskgit_plus':
+                    # 返回 confidence（置信度）和 x0（最可能的 token ID）
+                    confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
+                elif alg == 'topk_margin':
+                    confidence, x0 = sample_tokens(
+                        mask_logits,
+                        temperature=temperature,
+                        top_p=top_p,
+                        top_k=top_k,
+                        margin_confidence=True
+                    )
+                elif alg == 'entropy':
+                    confidence, x0 = sample_tokens(
+                        mask_logits,
+                        temperature,
+                        top_p=top_p,
+                        top_k=top_k,
+                        neg_entropy=True
+                    )
+                else:
+                    raise RuntimeError(f"Unknown alg: {alg}")
+                # 当前还有多少 mask 位置
+                num_mask_token = mask_index.sum()
+                # 根据 schedule（或默认比例）决定本轮要去噪多少个
+                if sch is not None:
+                    number_transfer_tokens = sch[0, i]
+                else:
+                    number_transfer_tokens = int(num_mask_token * (1 - s / t)) if i < steps - 1 else num_mask_token
+                if number_transfer_tokens > 0:
+                    if alg_temp is None or alg_temp == 0:
+                        # 直接选置信度最高的 number_transfer_tokens 个位置
+                        _, transfer_index = torch.topk(confidence, number_transfer_tokens)
+                    else:
+                        # 用温度调节 confidence，再按多项式采样 number_transfer_tokens 个
+                        confidence = confidence / alg_temp
+                        confidence = F.softmax(confidence, dim=-1)
+                        transfer_index = torch.multinomial(confidence, num_samples=number_transfer_tokens)
+                    # x0_ 临时占位，全填 mask
+                    x0_ = torch.zeros_like(x0, device=self.device, dtype=torch.long) + mask_token_id
+                    # 在选中的位置填入从 x0 (argmax token) 中取得的 token
+                    x0_[transfer_index] = x0[transfer_index].clone()
+                    # 更新 x：只替换 mask_index 位置
+                    x[mask_index] = x0_
+                    #如果出现的token有 151643（eos) ,那么他后面的所有都换成 151643，不需要再次mask
+                    SPECIAL_TOKEN_ID = 151643
+                    if (x == SPECIAL_TOKEN_ID).any():
+                        # 对每个 batch 处理
+                        for b in range(x.shape[0]):
+                            row = x[b]
+                            # 找到第一个出现 SPECIAL_TOKEN_ID 的位置
+                            idx = (row == SPECIAL_TOKEN_ID).nonzero(as_tuple=True)[0]
+                            if len(idx) > 0:
+                                first_idx = idx[0].item()
+                                # 该位置及其后面全部赋值为 SPECIAL_TOKEN_ID
+                                row[first_idx:] = SPECIAL_TOKEN_ID
+                                x[b] = row
+            # 10.8 用户自定义 token 钩子：对本轮更新后的 x 做额外处理
+            x = generation_tokens_hook_func(i, x, logits)
+            # 10.9 如果需要保存历史，就把当前 x clone 一份放进去
+            if histories is not None:
+                histories.append(x.clone())
+        # ForkedPdb().set_trace()
+        # 11. 循环结束后，根据 return_dict_in_generate 决定返回形式
+        if return_dict_in_generate:
+            return DreamModelOutput(
+                sequences=x,     # 最终生成的完整 token 序列 [B, max_length]
+                history=histories,  # 如果启用，会包含每一���的 x
+            )
+        else:
+            return x  # 只返回最终序列 [B, max_length]

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b35dfdb5e496fd487d09b57b993bfd42726f8883d8b248c72a9c3e4aa623089
+size 4992396112

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87ad192520ebda1eb404f566dea689c95d2e3aa52ea7d1bef508419f2a829e18
+size 4991481280

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf26b729c709cfe7d4937b9f8361c079708e8a1bfe8ffa71e85e34e814382b49
+size 4828352366

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1110575f31d68d39ddd890f3935971a2c76d8add5e112b2aad9098d34090d43
+size 1263460480

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_dream.py ADDED Viewed

	@@ -0,0 +1,1781 @@

+# coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT and Qwen implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT and Qwen used by the Meta AI and Qwen team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Dream model."""
+from .modeling_sensevoice import AudioEncoder
+from .resampler_projector import ResamplerProjector
+import random
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+# import torch._dynamo
+# torch._dynamo.config.suppress_errors = True
+import sys
+import pdb
+class ForkedPdb(pdb.Pdb):
+    """
+    PDB Subclass for debugging multi-processed code
+    Suggested in: https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess
+    """
+    def interaction(self, *args, **kwargs):
+        _stdin = sys.stdin
+        try:
+            sys.stdin = open('/dev/stdin')
+            pdb.Pdb.interaction(self, *args, **kwargs)
+        finally:
+            sys.stdin = _stdin
+import math
+from typing import List, Optional, Tuple, Union
+import os
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_outputs import (
+    # BaseModelOutput,
+    MaskedLMOutput,
+    ModelOutput
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+)
+from transformers import PretrainedConfig
+from .configuration_dream import DreamConfig
+from .generation_utils import DreamGenerationMixin, DreamGenerationConfig
+from  dataclasses import  dataclass
+from typing import Any
+if is_flash_attn_2_available():
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .modeling_sensevoice import AudioEncoder
+from .resampler_projector import ResamplerProjector
+logger = logging.get_logger(__name__)
+# def forward_process(bsz, seq_len, device, first_non_neg_idx_list, last_non_neg_idx_list, eps=1e-3):
+#     b, l = bsz, seq_len  # b → batch_size，l → 序列长度
+#     # 初始化掩码输出
+#     masked_indices = torch.zeros((b, l), device=device, dtype=torch.bool)
+#     p_mask = torch.rand(b, device=device)  # shape: [b]，生成一个随机数作为掩码比例
+#     # 映射到 (eps, 1) 区间，保证最小值不低于 eps
+#     p_mask = (1 - eps) * p_mask + eps     # shape: [b]
+#     p_mask = p_mask[:, None]              # shape: [b, 1]，方便广播
+#     # 针对每个样本的有效部分生成掩码
+#     for i in range(b):
+#         first_non_neg_idx = first_non_neg_idx_list[i]
+#         last_non_neg_idx = last_non_neg_idx_list[i]
+#         # 如果无有效区间，跳过
+#         if first_non_neg_idx is None or last_non_neg_idx is None:
+#             continue
+#         valid_length = last_non_neg_idx - first_non_neg_idx + 1
+#         if valid_length <= 0:
+#             continue
+#         # 生成当前样本的掩码概率阈值
+#         t = torch.rand(valid_length, device=device)  # shape: [valid_length]
+#         mask_threshold = (1 - eps) * t + eps  # 计算该样本的掩码概率
+#         # 计算该样本掩码的上限
+#         mask_cutoff = torch.max(mask_threshold, torch.min(torch.rand(valid_length, device=device)))  # shape: [valid_length]
+#         # 在有效部分生成掩码
+#         masked_indices[i, first_non_neg_idx:last_non_neg_idx+1] = torch.rand(valid_length, device=device) <= mask_cutoff
+#     return masked_indices, p_mask
+def forward_process(
+    bsz: int,
+    seq_len: int,
+    device: torch.device,
+    labels: torch.Tensor,                # [b, l] 的标签 tensor
+    eps: float = 1e-3,
+    special_token_id: int = 151643,      # 要“优待”的 token id
+    special_mask_ratio: float = 0.1      # special token 只按原阈值的 10% 掩
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    生成掩码，并打印统计：
+      - 总共被掩的 token 数
+      - special_token_id 被掩的数量
+    参数：
+      - bsz: batch size
+      - seq_len: 序列长度
+      - device: torch 设备
+      - labels: [b, l] 的标签 tensor（-100 表示无效位置）
+      - eps: 阈值下限
+      - special_token_id: 要降低掩码率的特殊 token id
+      - special_mask_ratio: special token 的掩码率缩放因子
+    返回：
+      - masked_indices: [b, l] 的 bool 掩码矩阵
+      - p_mask:           [b, 1] 每个样本的整体掩码比例
+    """
+    b, l = bsz, seq_len
+    # 初始化掩码矩阵 & 每个样本的整体掩码比例
+    masked_indices = torch.zeros((b, l), device=device, dtype=torch.bool)
+    p_mask = torch.rand(b, device=device)
+    p_mask = (1 - eps) * p_mask + eps
+    p_mask = p_mask.unsqueeze(1)  # [b,1]
+    # 先为每条序列计算第一个和最后一个非 -100 的位置
+    first_idxs = []
+    last_idxs  = []
+    for i in range(b):
+        nonneg = (labels[i] != -100).nonzero(as_tuple=True)[0]
+        if nonneg.numel() == 0:
+            first_idxs.append(None)
+            last_idxs.append(None)
+        else:
+            first_idxs.append(int(nonneg[0]))
+            last_idxs.append(int(nonneg[-1]))
+    # 针对每条序列的有效区间生成掩码
+    for i in range(b):
+        start = first_idxs[i]
+        end   = last_idxs[i]
+        if start is None or end is None or end < start:
+            continue
+        valid_len = end - start + 1
+        # 为每个位置生成基础阈值
+        t = torch.rand(valid_len, device=device)
+        mask_threshold = (1 - eps) * t + eps  # [valid_len]
+        # 生成随机判定值
+        rand_vals = torch.rand(valid_len, device=device)
+        # 普通 token 的掩码决定
+        normal_mask = rand_vals <= mask_threshold
+        # special token 的掩码阈值更低
+        special_thresh = mask_threshold * special_mask_ratio
+        special_mask   = rand_vals <= special_thresh
+        labels_slice = labels[i, start : end + 1]
+        # 最终掩码：特殊 token 用 special_mask，其他用 normal_mask
+        final_mask = torch.where(
+            labels_slice == special_token_id,
+            special_mask,
+            normal_mask
+        )
+        masked_indices[i, start : end + 1] = final_mask
+    # 打印统计信息
+    total_masked   = int(masked_indices.sum().item())
+    special_masked = int((masked_indices & (labels == special_token_id)).sum().item())
+    # print(f"Total masked tokens: {total_masked}")
+    # print(f"Special token_id={special_token_id} masked count: {special_masked}")
+    return masked_indices, p_mask
+# def forward_process(bsz, seq_len, device, eps=1e-3):
+#     b, l = bsz, seq_len          # b → batch_size，l → 序列长度
+#     # 1) 为 batch 中的每个样本生成一个 0~1 的随机数 t
+#     t = torch.rand(b, device=device)
+#     # 2) 把 t 映射到 (eps, 1) 区间，保证最小值不低于 eps
+#     #    p_mask 相当于给每个样本定一个「掩码概率阈值」
+#     p_mask = (1 - eps) * t + eps     # shape: [b]
+#     # 3) 扩展出维度 [b, 1]，方便后续广播
+#     p_mask = p_mask[:, None]         # shape: [b, 1]
+#     # 4) 针对 batch 中的每个 token 再生成一次随机数
+#     masked_indices = torch.rand((b, l), device=device)   # shape: [b, l]
+#     # 5) 计算当前样本要用的“掩码上限”：
+#     #    - masked_indices.min(-1).values  → 每个样本里随机矩阵的最小值（保证至少有一个 token 会被掩掉）
+#     #    - torch.max(p_mask, 该最小值)   → 二者取大，得到最终 cutoff
+#     mask_cutoff = torch.max(p_mask,
+#                             masked_indices.min(-1, keepdim=True).values)  # shape: [b, 1]
+#     # 6) 生成最终布尔掩码：随机值 ≤ cutoff 的 token 被置 True
+#     masked_indices = masked_indices <= mask_cutoff        # shape: [b, l]，dtype=bool
+#     # 7) （可选）把 True 位置替换成 [MASK] token（示例注释里用 126336 表示）
+#     # noisy_batch = torch.where(masked_indices, 126336, input_ids)
+#     # 返回：
+#     # masked_indices → [b, l] 的布尔矩阵，告诉你哪些 token 需要被掩码
+#     # p_mask         → [b, 1] 的阈值，记录每条样本的“目标掩码比例”
+#     return masked_indices, p_mask
+def generate_attention_mask(labels):
+    batch_size, seq_len = labels.shape
+    attention_mask = torch.zeros(batch_size, seq_len, seq_len, device=labels.device)
+    # 用于存储每个 batch 的 first_non_neg_idx 和 last_non_neg_idx
+    first_non_neg_idx_list = []
+    last_non_neg_idx_list = []
+    for i in range(batch_size):
+        label = labels[i]
+        # assert label.dtype in [torch.int64, torch.int32], f"label dtype is {label.dtype}"
+        # assert not torch.isnan(label.float()).any(), "label has NaN"
+        # assert not torch.isinf(label.float()).any(), "label has inf"
+        try:
+            non_neg_idx = (label != -100).nonzero(as_tuple=True)[0]
+        except Exception as e:
+            label_cpu = label.detach().cpu()      # 先搬到 CPU
+            print("label (unique) =", label_cpu.unique(), "shape =", label_cpu.shape)
+            print('label.device:', label.device)
+            print('label.shape:', label.shape)
+            # 先拷到 CPU 再打印
+            try:
+                print('label (cpu):', label.cpu())
+            except Exception as e2:
+                print('label.cpu() 也出错:', e2)
+            print('Exception:', e)
+            # continue
+            # continue  # 跳过这个样本
+        # assert label.dtype in [torch.int64, torch.int32], f"label dtype is {label.dtype}"
+        # assert not torch.isnan(label).any(), "label has NaN"
+        # assert not torch.isinf(label).any(), "label has inf"
+        # try:
+        #     non_neg_idx = (label != -100).nonzero(as_tuple=True)[0]
+        # except:
+        #     print('label is :',label)
+        if non_neg_idx.numel() == 0:
+            # 全是-100，无法分区，给默认值或raise
+            first_non_neg_idx = None
+            last_non_neg_idx = None
+            # 你可以选择跳过或全0/全1
+            # attention_mask[i] = 0  # 或者1
+        else:
+            first_non_neg_idx = non_neg_idx[0].item()
+            last_non_neg_idx = non_neg_idx[-1].item()
+            # 第一部分只能看到自己
+            attention_mask[i, :first_non_neg_idx, :first_non_neg_idx] = 1
+            # 第二部分能看到第一部分和自己
+            attention_mask[i, first_non_neg_idx:last_non_neg_idx + 1, :first_non_neg_idx] = 1
+            attention_mask[i, first_non_neg_idx:last_non_neg_idx + 1, first_non_neg_idx:last_non_neg_idx + 1] = 1
+            # 第三部分能看到所有部分
+            attention_mask[i, last_non_neg_idx + 1:, :] = 1
+        first_non_neg_idx_list.append(first_non_neg_idx)
+        last_non_neg_idx_list.append(last_non_neg_idx)
+    return attention_mask, first_non_neg_idx_list, last_non_neg_idx_list
+def update_labels(input_ids, labels, eos_id, max_n=20):
+    batch_size, seq_len = input_ids.shape
+    first_occurrence_indices = []
+    # 记录每个 batch 中 eos_id 首次出现的位置
+    for idx in range(batch_size):
+        eos_positions = (input_ids[idx] == eos_id).nonzero(as_tuple=True)[0]
+        if len(eos_positions) > 0:
+            first_occurrence_indices.append(eos_positions[0].item())
+        else:
+            first_occurrence_indices.append(-1)  # 如果没有 eos_id，则记录为 -1
+    # 从 first_idx 开始，按顺序选择 n 个位置来更新
+    for i in range(batch_size):
+            first_idx = first_occurrence_indices[i]
+            if first_idx == -1:
+                continue  # 跳过没有 eos 的样本
+            # 确保不会超过序列长度
+            max_possible = seq_len - first_idx
+            # 如果 max_possible==0，说明 eos 刚好在最后一个位置，也跳过
+            if max_possible <= 0:
+                continue
+            num_to_select = random.randint(1, min(max_n, max_possible))
+            selected_indices = torch.arange(first_idx, first_idx + num_to_select)
+            # 将这些位置的 labels 更新为 eos_id
+            labels[i, selected_indices] = eos_id
+    return labels
+import torch
+import random
+def update_labels_and_inputs(input_ids, labels, eos_id, max_n=20, pad_token_id=0, pad_label_id=-100):
+    batch_size, seq_len = input_ids.shape
+    input_ids = input_ids.clone()
+    labels = labels.clone()
+    new_input_ids = []
+    new_labels = []
+    for idx in range(batch_size):
+        eos_positions = (input_ids[idx] == eos_id).nonzero(as_tuple=True)[0]
+        if len(eos_positions) > 0:
+            first_idx = eos_positions[0].item()
+            cur_input_ids = input_ids[idx]
+            cur_labels = labels[idx]
+        else:
+            # 扩展 max_n 个 eos_id
+            random_max_n = random.randint(1, max_n)
+            eos_ids = torch.full((random_max_n,), eos_id, device=input_ids.device, dtype=input_ids.dtype)
+            cur_input_ids = torch.cat([input_ids[idx], eos_ids])
+            # labels 扩展 max_n 个 pad_label_id
+            pad_labels = torch.full((random_max_n,), eos_id, device=labels.device, dtype=labels.dtype)
+            cur_labels = torch.cat([labels[idx], pad_labels])
+            # first_idx = len(cur_input_ids) - random_max_n
+        new_input_ids.append(cur_input_ids)
+        new_labels.append(cur_labels)
+    # pad到同一长度
+    max_len = max(len(x) for x in new_input_ids)
+    padded_input_ids = torch.stack([
+        torch.cat([x, torch.full((max_len - len(x),), pad_token_id, device=x.device, dtype=x.dtype)])
+        for x in new_input_ids
+    ])
+    padded_labels = torch.stack([
+        torch.cat([x, torch.full((max_len - len(x),), pad_label_id, device=x.device, dtype=x.dtype)])
+        for x in new_labels
+    ])
+    return padded_input_ids, padded_labels
+@dataclass
+class MaskedLMOutput(ModelOutput):
+    """
+    Base class for masked language models outputs.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Masked language modeling (MLM) loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Annotated as a nested tuple of tensors; presumably the cached key/value states handed back by the
+            model (TODO confirm against the code that fills this field).
+    """
+    loss: Optional[torch.FloatTensor] = None
+    # Per-task loss fields below are disabled; only the aggregate `loss` is carried.
+    # loss_tqa: Optional[torch.FloatTensor] = None
+    # loss_sqa: Optional[torch.FloatTensor] = None
+    # loss_asr: Optional[torch.FloatTensor] = None
+    # loss_tts: Optional[torch.FloatTensor] = None
+    # loss_vqa: Optional[torch.FloatTensor] = None
+    # loss_svqa: Optional[torch.FloatTensor] = None
+    # loss_t2i: Optional[torch.FloatTensor] = None
+    # loss_s2i: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+# Checkpoint / config names kept as module constants; presumably consumed by documentation helpers
+# elsewhere in the file (not visible from here).
+_CHECKPOINT_FOR_DOC = "Dream-7B"
+_CONFIG_FOR_DOC = "DreamConfig"
+import os
+# NOTE(review): when the environment variable is set this holds its raw string, so any non-empty value
+# (including "0" or "false") is truthy; only the unset case yields the boolean False. Confirm how it is used.
+ENFORCE_NUM_ITEMIN_BATCH = os.environ.get("ENFORCE_NUM_ITEMIN_BATCH", False)
+@dataclass
+class BaseModelOutput(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states and attentions.
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        past_key_values (`Cache`, *optional*):
+            Annotated as a `Cache` object; presumably the key/value cache produced during the forward pass
+            (TODO confirm against the code that fills this field).
+    """
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    past_key_values: Optional[Cache] = None
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Dream
+class DreamRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DreamRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Dream
+class DreamRotaryEmbedding(nn.Module):
+    """
+    Produces the cosine / sine tables used for rotary position embeddings.
+    The inverse frequencies come from `ROPE_INIT_FUNCTIONS[rope_type]`; `forward` multiplies them with the
+    given position ids and returns `(cos, sin)` cast to the dtype of `x`.
+    """
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[DreamConfig] = None,
+    ):
+        """
+        Args:
+            dim, max_position_embeddings, base, scaling_factor, rope_type: legacy arguments, only used
+                (forwarded as `rope_kwargs`) when `config` is None.
+            device: device passed to the RoPE init function.
+            config: model config; when given, the RoPE type is read from `config.rope_scaling` and the
+                sequence limits from `config.max_position_embeddings`.
+        """
+        super().__init__()
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:
+            logger.warning_once(
+                "`DreamRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.46"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            self.max_seq_len_cached = max_position_embeddings
+            self.original_max_seq_len = max_position_embeddings
+        else:
+            # BC: "rope_type" was originally "type"
+            if config.rope_scaling is not None:
+                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+            else:
+                self.rope_type = "default"
+            self.max_seq_len_cached = config.max_position_embeddings
+            self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        # Non-persistent buffer: follows the module across devices but is not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Kept so `_dynamic_frequency_update` can restore the initial frequencies.
+        self.original_inv_freq = self.inv_freq
+    def reset_parameters(self):
+        """Recompute `inv_freq` and `attention_scaling` on the buffer's current device."""
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, self.inv_freq.device, **self.rope_kwargs)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        # Largest position id plus one; note this is a tensor, not a Python int.
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len, **self.rope_kwargs
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        """
+        Build the rotary tables for `position_ids`.
+        Args:
+            x: tensor used only for its device and dtype.
+            position_ids: position indices; assumed to be `(batch, seq_len)` given the `[:, None, :]`
+                indexing below (TODO confirm with callers).
+        Returns:
+            `(cos, sin)`, each with the frequencies duplicated along the last dimension and cast to `x.dtype`.
+        """
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+        # Core RoPE block
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        # "mps" (or a non-string device type) is mapped to "cpu" for the autocast context.
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Dream
+class DreamMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class DreamAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+    """
+    def __init__(self, config: DreamConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = False
+        self.attention_dropout = config.attention_dropout
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = DreamRotaryEmbedding(config=self.config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class DreamSdpaAttention(DreamAttention):
+    """
+    Dream attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `DreamAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+    # Adapted from DreamAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "DreamModel is using DreamSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+        # breakpoint()
+        # ForkedPdb().set_trace()
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        # causal_mask = attention_mask
+        # if attention_mask is not None:  # no matter the length, we just slice it
+        #     causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+        # is_causal = True if causal_mask is None and q_len > 1 else False
+        bool_mask = attention_mask.to(torch.bool)
+        #原始的
+        # ForkedPdb().set_trace()
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=bool_mask ,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=False, # hard coded
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None, past_key_value
+        #换成flash attn
+        # attention_interface = ALL_ATTENTION_FUNCTIONS["flash_attention_2"]
+        # # ForkedPdb().set_trace()
+        # attn_output, attn_weights = attention_interface(
+        #     self,
+        #     query_states,
+        #     key_states,
+        #     value_states,
+        #     attention_mask,
+        #     dropout=0.0 if not self.training else self.attention_dropout,
+        #     scaling=self.head_dim**-0.5,
+        #     sliding_window=None,
+        #     position_ids=position_ids,
+        #     output_attentions= output_attentions,
+        #     use_cache = use_cache
+        #     # 其他参数
+        # )
+        # # attn_output = attn_output.transpose(1, 2).contiguous()
+        # attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        # attn_output = self.o_proj(attn_output)
+        # return attn_output, attn_weights, past_key_value
+class DreamDecoderLayer(nn.Module):
+    def __init__(self, config: DreamConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        if config.sliding_window and config._attn_implementation != "flash_attention_2":
+            logger.warning_once(
+                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
+                "unexpected results may be encountered."
+            )
+        # self.self_attn = Dream_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.self_attn = DreamSdpaAttention(config, layer_idx)
+        self.mlp = DreamMLP(config)
+        self.input_layernorm = DreamRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = DreamRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    # @torch.compile
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        # ForkedPdb().set_trace()
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class DreamPreTrainedModel(PreTrainedModel):
+    config_class = DreamConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DreamDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: Optional[bool] = None,
+        weights_only: bool = True,
+        **kwargs,
+    ):
+        _model,_  = super().from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            weights_only=weights_only,
+            **kwargs,
+        )
+# _model[0].generation_config
+#         ForkedPdb().set_trace()
+        # NOTE(Lin): we need to override the generation config
+        # because the generation config loaded in `from_pretrained`
+        # does not include all the attributes of DreamGenerationConfig
+        resume_download = kwargs.get("resume_download", None)
+        proxies = kwargs.get("proxies", None)
+        subfolder = kwargs.get("subfolder", "")
+        from_auto_class = kwargs.get("_from_auto", False)
+        from_pipeline = kwargs.get("_from_pipeline", None)
+        _model.generation_config= DreamGenerationConfig.from_pretrained(
+            pretrained_model_name_or_path,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            resume_download=resume_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            _from_auto=from_auto_class,
+            _from_pipeline=from_pipeline,
+        )
+        return _model,_
+class DreamPrefixLMCache(Cache):
+    def __init__(self):
+        super().__init__()
+        self.past_key_values = {}
+        # this will not be updated beyond the prefilling phase
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if layer_idx in self.past_key_values:
+            past_key, past_value = self.past_key_values[layer_idx]
+            key_states = torch.cat((past_key, key_states), dim=-2)
+            value_states = torch.cat((past_value, value_states), dim=-2)
+            return key_states,value_states
+        else:
+            self.past_key_values[layer_idx] = (key_states, value_states)
+            return key_states, value_states
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # TODO: deprecate this function in favor of `cache_position`
+        if len(self.past_key_values) == 0:
+            return 0
+        else:
+            return self.past_key_values[0][0].shape[-2]
+    def get_max_cache_shape(self) -> Optional[int]:
+        return None
+import deepspeed
+class DreamBaseModel(DreamPreTrainedModel):#
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DreamDecoderLayer`]
+    Args:
+        config: DreamConfig
+    """
+    def __init__(self, config: DreamConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [DreamDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._attn_implementation = config._attn_implementation
+        self.norm = DreamRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = DreamRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.audio_model = AudioEncoder()
+        self.audio_projection = ResamplerProjector(512, config.hidden_size)
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        audios: Optional[torch.FloatTensor] = None,
+        audio_indices: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        # ForkedPdb().set_trace()
+        if (past_key_values is None or len(past_key_values) == 0) and audios is not None:
+            audio_embeds, audio_lengths = self.audio_model(audios)
+            # if torch.distributed.get_rank() == 0:
+            #     print(f"audio_embeds {audio_embeds.size()}")
+            assert audio_embeds.shape[0] == len(audios)
+            fake_audios = None
+            audio_embeds = self.audio_projection(audio_embeds)
+            # torch.set_printoptions(threshold=100_000)
+            # if torch.distributed.get_rank() == 0:
+            #     print(f"audio_embeds {audio_embeds.size()}")
+            #     print(f"audio_embeds {audio_embeds.sum()}")
+            #     print(f"audios {[x.size() for x in audios]}")
+            #     print(f"audios {[x.sum() for x in audios]}")
+            #     print(f"input_ids {input_ids.size()}")
+            #     print(f"input_ids {input_ids.sum()}")
+            #     # print(f"input_ids {input_ids}")
+            #     print(f"audio_indices {[x.size() for x in audio_indices]}")
+            #     print(f"audio_indices {[x.sum() for x in audio_indices]}")
+            #     # print(f"audio_indices {audio_indices}")
+        elif self.training:
+            device = self.get_input_embeddings().weight.data.device
+            dtype = self.get_input_embeddings().weight.data.dtype
+            fake_audios = torch.ones((1, 1, 560), dtype=dtype, device=device)
+            audio_embeds, audio_lengths = self.audio_model(fake_audios)
+            audio_embeds = self.audio_projection(audio_embeds)
+        else:
+            fake_audios = None
+            audio_embeds = None
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if fake_audios is not None:
+            inputs_embeds = inputs_embeds + audio_embeds.mean() * 0.0
+        elif audio_embeds is not None:
+            inputs_embeds = inputs_embeds.clone()
+            for audio_embeds_, audio_lengths_, audio_indices_ in zip(audio_embeds, audio_lengths, audio_indices,):
+                # print(f"{audio_embeds_.size()=} {audio_lengths_=} {audio_indices_.size()=}")
+                audio_embeds_ = audio_embeds_[:audio_lengths_, ...]
+                audio_embeds_ = audio_embeds_.to(inputs_embeds.device)
+                indices_b, indices_s = audio_indices_.to(inputs_embeds.device).unbind(dim=0)
+                inputs_embeds[indices_b.view(-1), indices_s.view(-1)] = audio_embeds_.view(-1, audio_embeds_.shape[-1])
+            # inputs_embeds = inputs_embeds + audio_embeds.mean() * 0.0
+        if use_cache and past_key_values is None:
+            past_key_values = DreamPrefixLMCache()
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = deepspeed.checkpointing.checkpoint(
+                    decoder_layer,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
+            # breakpoint()
+            if isinstance(layer_outputs,torch.Tensor):
+                layer_outputs = (layer_outputs,None)
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            past_key_values=past_key_values,
+        )
+class DreamModel(DreamGenerationMixin, DreamPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = DreamBaseModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.tokenizer = None
+        self.post_init()
+    def reset_rope_parameters(self):
+        self.model.rotary_emb.reset_parameters()
+        for layer in self.model.layers:
+            layer.self_attn.rotary_emb.reset_parameters()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        audios: Optional[torch.FloatTensor] = None,
+        audio_indices: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+        num_items_in_batch: int = None,
+        **loss_kwargs,
+    ) -> Union[Tuple, MaskedLMOutput]:
+        # eos_id = 151643                # 自定义 <eos>
+        # mask_id = 151666                # 自定义 <mask>
+        # import pdb; pdb.set_trace()      # ⚠ 调试断点，如无需要可删
+        # raw_inputs_ids = input_ids       # 保留原始 ID，后续需要对齐 labels
+        # ---------------------------------------------------------
+        # 1. 将 <eos> 位置从注意力 & labels 中临时移除（参见 Sec B.1）
+        # ---------------------------------------------------------
+        #最终的输出也需要 eos，或者说输入的最后就应该全是eos
+        # non_padding = ~(raw_inputs_ids == eos_id)
+        # 强制让 <eos> 位在 attention_mask 中视为 *可被注意*（True）
+        # attention_mask[raw_inputs_ids == eos_id] = True
+        # labels 位置恢复成 eos_id，避免被 -100 忽略
+        # 更新labels，让模型能学到eos，但是又不想弄太多eos来影响训练
+        # input_ids, labels = update_labels_and_inputs(input_ids,labels,eos_id,300)
+        # new_attention_mask, first_non_neg_idx_list, last_non_neg_idx_list = generate_attention_mask(new_lables)
+        # first_non_neg_idx_list里面可能有None
+        # ---------------------------------------------------------
+        # 3. 若存在 labels（训练模式），进行 Forward‑Process：
+        #    • 采样需要 Mask 的 token 下标 (masked_indices)
+        #    • 为每个样本构造互补分支 (masked / inverse masked)
+        #    • 拼接两条分支，得到 2×batch 的输入 / labels
+        # ---------------------------------------------------------
+        # if labels is not None:
+        #     # audio 这一块应该没有这个必要？    不过训练也行吧
+        #     labels_mask           = ~(labels == -100)     # label != -100,assitant 部分
+        #     # noise_embeddings      = self.get_input_embeddings()(torch.tensor([mask_id]).to(raw_inputs_ids))  # (1, D)
+        #     bsz, seq_len          = labels_mask.shape
+        #     # noise_embeddings      = noise_embeddings.view(1, 1, -1)  # mask token 的embedding
+        #     # 生成masked_indices, p_mask
+        #     masked_indices, p_mask = forward_process(
+        #         bsz, seq_len, raw_inputs_ids.device, labels
+        #     )
+        #     # ForkedPdb().set_trace()
+        #     # 只mask有效token
+        #     final_masked_indices = masked_indices & labels_mask
+        #     final_masked_indices_inv = (~masked_indices) & labels_mask
+        #     # mask_id要和input_ids类型、设备一致
+        #     mask_id_tensor = torch.full_like(input_ids, mask_id)
+        #     input_ids = torch.where(final_masked_indices, mask_id_tensor, input_ids)
+        #     # new_labels是labels的clone
+        #     new_labels = labels.clone()
+        #     new_labels[final_masked_indices_inv] = -100
+            # final_masked_indices_inv = (~masked_indices) & labels_mask  #assistant并且没有被mask部分
+            # 使用 torch.where 将目标 token 替换为噪声向量
+            #
+            #这里改成把输入换成mask tokne的id就行
+            # inputs_embeds_inv = torch.where(final_masked_indices_inv.view(bsz, seq_len, 1),
+            #                                 noise_embeddings, inputs_embeds)   #没被mask部分
+            # inputs_embeds     = torch.where(final_masked_indices.view(bsz, seq_len, 1),
+            #                                 noise_embeddings, inputs_embeds) #mask部分
+            # ForkedPdb().set_trace()
+            # 构造两份 labels：各自只在对应分支需要预测的位置保留真值，其余填 -100
+            # labels_inv                = labels.clone()
+            # labels_inv[~final_masked_indices_inv] = -100
+            # labels[~final_masked_indices]         = -100
+            # 将两条分支沿 batch 维度拼接：
+            # 文章里面说的是，视觉元素可能出现在没被mask的地方，导致训了没啥用
+            # inputs_embeds        = torch.cat([inputs_embeds, inputs_embeds_inv])
+            # labels               = torch.cat([labels, labels_inv])
+            # final_masked_indices = torch.cat([final_masked_indices, final_masked_indices_inv])
+            # Debug: 打印序列长度
+            # seq_len = labels.shape[-1]
+            # print(f"[forward] seq_len={seq_len}")
+        # ---------------------------------------------------------
+        # 4. (可选) DPO ‑style 正/反样本前向；此处暂未实现
+        # ---------------------------------------------------------
+        # if dpo_forward:
+        #     raise NotImplementedError("DPO forward 尚未实现，请按需补充")
+        # ForkedPdb().set_trace()
+        # ---------------------------------------------------------
+        # 5. 常规前向 — 调用基类实现
+        # ---------------------------------------------------------
+        # attention_mask = None  # ⚠ 此处把 mask 置空，让基类自己处理（或依赖 ALiBi）
+        #import pdb; pdb.set_trace()
+        #import time
+        #print(f"begin forward - {time.time()} - {input_ids.device}")
+        num_items_in_batch = None
+        if ENFORCE_NUM_ITEMIN_BATCH:
+            num_items_in_batch = labels.ne(-100).sum()
+            num_items_in_batch = torch.distributed.reduce(num_items_in_batch)
+        # ForkedPdb().set_trace()
+        # lables = new_labels
+        # new_attention_mask = None
+        # attention_mask = new_attention_mask
+        is_new = position_ids == 0
+        # is_new[0] = True
+        segment_id = torch.cumsum(is_new.long(), dim=1) - 1
+        new_attention_mask = (segment_id.unsqueeze(1) == segment_id.unsqueeze(2)).long()
+        # ForkedPdb().set_trace()
+        mask = attention_mask.unsqueeze(-1)  # [bs, len, 1]
+        new_attention_mask = new_attention_mask * mask  # [bs, len, len] * [bs, len, 1]，自动broadcast
+        if self.config.chunk_size > 0:
+            item_start_id = torch.where(position_ids[0] == 0)[0]
+            im_start_id = torch.where(input_ids[0] == self.tokenizer.encode("<|im_start|>")[0])[0].tolist()
+            chunk_mask = torch.zeros_like(new_attention_mask)
+            for item_i in range(len(item_start_id)):
+                im_start = item_start_id[item_i]
+                im_end = item_start_id[item_i + 1] if item_i != len(item_start_id) - 1 else input_ids.shape[-1]
+                im_index = im_start_id.index(im_start)
+                chunk_begin = im_start
+                while im_index < len(im_start_id) and im_start_id[im_index] < im_end:
+                    if self.tokenizer.decode(input_ids[0, im_start_id[im_index]+1]) == "assistant":
+                        chunk_id = 1
+                        ans_begin = im_start_id[im_index]
+                        ans_end = im_start_id[im_index + 1] if im_index != len(im_start_id) - 1 else input_ids.shape[-1]
+                        while 1:
+                            chunk_end = min(ans_begin + chunk_id * self.config.chunk_size, ans_end)
+                            chunk_mask[:, chunk_begin: chunk_end, im_start: chunk_end] = 1
+                            if chunk_end == ans_end: break
+                            chunk_id += 1
+                            chunk_begin = chunk_end
+                        chunk_begin = chunk_end
+                        im_index += 1
+                    else:
+                        im_index += 1; continue
+            new_attention_mask = new_attention_mask * chunk_mask
+            # visualization
+            # import matplotlib.pyplot as plt
+            # mask_np = new_attention_mask[0].detach().cpu().numpy()
+            # plt.figure(figsize=(10, 8)); plt.imshow(mask_np); plt.savefig("tmp.png"); plt.close()
+            # if chunk_num != (position_ids == 0).sum(): import pdb; pdb.set_trace()
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # import pdb;pdb.set_trace()
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        # position_ids = torch.arange(input_ids.size(1), dtype=torch.long).unsqueeze(0)
+        # position_ids = torch.arange(
+        #     input_ids.size(1),
+        #     dtype=torch.long,
+        #     device=input_ids.device
+        # ).unsqueeze(0).expand(input_ids.size(0), -1)
+        # print(input_ids.shape,labels.shape)
+        #import time
+        #print(f"self.model forward - {time.time()} - {input_ids.device}")
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=new_attention_mask,
+            audios=audios,
+            audio_indices=audio_indices,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        # import pdb;pdb.set_trace()
+        loss = None
+        if labels is not None:
+            if ENFORCE_NUM_ITEMIN_BATCH:
+                assert num_items_in_batch is not None, "num_items_in_batch must be provided if ENFORCE_NUM_ITEMIN_BATCH is True"
+            # ForkedPdb().set_trace()
+            loss = self.loss_function(logits, labels, self.vocab_size,num_items_in_batch=num_items_in_batch, **loss_kwargs)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        # ForkedPdb().set_trace()
+        #import time
+        #print(f"forward finish - {time.time()} - {input_ids.device}")
+        # loss_t2i = None
+        # loss_s2i = None
+        # loss_vqa = None
+        # loss_svqa = None
+        # loss_asr = None
+        # loss_tts = None
+        # loss_tqa = None
+        # loss_sqa = None
+        # input_text = self.tokenizer.decode(input_ids[0])
+        # t2i_prompt = get_t2i_prompt()
+        # for p in t2i_prompt:
+        #     if p in input_text: loss_t2i = loss.detach().copy(); break
+        # if "Convert the speech to text." in input_text: loss_asr = loss.detach().copy()
+        # elif "Convert the text to speech." in input_text: loss_tts = loss.detach().copy()
+        # elif "<|image" not in input_text and "<|audio" not in input_text and loss_t2i is None: loss_tqa = loss.detach().copy()
+        # elif "<|image|>" in input_text and loss_t2i is None: in input_text: loss_vqa = loss.detach().copy()
+        # elif "Please response the input audio." in input_text: loss_sqa = loss.detach().copy()
+        # elif "Please generate an image based on the input audio." in input_text: loss_s2i = loss.detach().copy()
+        # elif "Please response the input audio based on the given image." in input_text: loss_svqa = loss.detach().copy()
+        return MaskedLMOutput(
+            loss=loss,
+            # loss_asr=loss_asr,
+            # loss_tts=loss_tts,
+            # loss_tqa=loss_tqa,
+            # loss_sqa=loss_sqa,
+            # loss_t2i=loss_t2i,
+            # loss_s2i=loss_s2i,
+            # loss_vqa=loss_vqa,
+            # loss_svqa=loss_svqa,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            past_key_values=outputs.past_key_values
+        )
+    def forward_dream(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+        **loss_kwargs,
+    ) -> Union[Tuple, MaskedLMOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        attention_mask = None
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        # import pdb;pdb.set_trace()
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return MaskedLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            past_key_values=outputs.past_key_values,
+        )
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        audios: Optional[torch.FloatTensor] = None,
+        audio_indices: Optional[torch.LongTensor] = None,
+        max_new_tokens=512,
+        steps=512,
+        temperature=0.2,
+        top_p=0.95,
+        alg_temp=0.,
+        alg="entropy",
+        output_history=False,
+        **kwargs,
+    ):
+        # modalities = kwargs.pop("modalities", None) if "modalities" in kwargs and modalities is None else modalities
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+        # import pdb;pdb.set_trace()
+        # if images is not None:
+        #     (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes)
+        # else:
+        #     # breakpoint()
+        #     inputs_embeds = self.get_model().embed_tokens(inputs)
+        #return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
+        #return llada_generate(self.get_model(),inputs_embeds=inputs_embeds,position_ids=position_ids,attention_mask=attention_mask,**kwargs)
+        # breakpoint()
+        # ForkedPdb().set_trace()
+        if audios is not None:
+            audio_embeds, audio_lengths = self.model.audio_model(audios)
+            # if torch.distributed.get_rank() == 0:
+            #     print(f"audio_embeds {audio_embeds.size()}")
+            assert audio_embeds.shape[0] == len(audios)
+            fake_audios = None
+            audio_embeds = self.model.audio_projection(audio_embeds)
+            # torch.set_printoptions(threshold=100_000)
+            # if torch.distributed.get_rank() == 0:
+            #     print(f"audio_embeds {audio_embeds.size()}")
+            #     print(f"audio_embeds {audio_embeds.sum()}")
+            #     print(f"audios {[x.size() for x in audios]}")
+            #     print(f"audios {[x.sum() for x in audios]}")
+            #     print(f"input_ids {input_ids.size()}")
+            #     print(f"input_ids {input_ids.sum()}")
+            #     # print(f"input_ids {input_ids}")
+            #     print(f"audio_indices {[x.size() for x in audio_indices]}")
+            #     print(f"audio_indices {[x.sum() for x in audio_indices]}")
+            #     # print(f"audio_indices {audio_indices}")
+        elif self.training:
+            device = self.model.get_input_embeddings().weight.data.device
+            dtype = self.model.get_input_embeddings().weight.data.dtype
+            fake_audios = torch.ones((1, 1, 560), dtype=dtype, device=device)
+            audio_embeds, audio_lengths = self.model.audio_model(fake_audios)
+            audio_embeds = self.model.audio_projection(audio_embeds)
+        else:
+            fake_audios = None
+            audio_embeds = None
+        # if inputs_embeds is None:
+        inputs_embeds = self.model.embed_tokens(input_ids)
+        if fake_audios is not None:
+            inputs_embeds = inputs_embeds + audio_embeds.mean() * 0.0
+        elif audio_embeds is not None:
+            inputs_embeds = inputs_embeds.clone()
+            for audio_embeds_, audio_lengths_, audio_indices_ in zip(audio_embeds, audio_lengths, audio_indices,):
+                # print(f"{audio_embeds_.size()=} {audio_lengths_=} {audio_indices_.size()=}")
+                audio_embeds_ = audio_embeds_[:audio_lengths_, ...]
+                audio_embeds_ = audio_embeds_.to(inputs_embeds.device)
+                indices_b, indices_s = audio_indices_.to(inputs_embeds.device).unbind(dim=0)
+                inputs_embeds[indices_b.view(-1), indices_s.view(-1)] = audio_embeds_.view(-1, audio_embeds_.shape[-1])
+            # inputs_embeds = inputs_embeds + audio_embeds.mean() * 0.0
+        return self.diffusion_generate(
+            None,
+            inputs_embeds=inputs_embeds,
+            max_new_tokens=max_new_tokens,
+            output_history=output_history,
+            return_dict_in_generate=True,
+            steps=steps,
+            temperature=temperature,
+            top_p=top_p,
+            alg=alg,
+            alg_temp=alg_temp,
+            **kwargs
+        )
+# class LlavaDreamForMaskedDiffusion(DreamModel,DreamPreTrainedModel):
+#     # config_class = LlavaDreamConfig
+#     supports_gradient_checkpointing = True
+#     def __init__(self, config: DreamConfig, model: Optional[DreamModel] = None, init_params: bool = False,vision_kwargs=None,**kwargs):
+#         DreamModel.__init__(self, config)
+#         # configure default generation settings
+#         config.model_type = "llava_dream"
+#         # config.rope_scaling = None
+#         # if not model:
+#         self.model = DreamModel(config)
+#         # else:
+#         #     self.model = model
+#         #self.model.set_activation_checkpointing('whole_layer')
+#         self.post_init() # TODO
+#     def get_model(self):
+#         return self.model

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "additional_special_tokens": [
+    "<|beginoftext|>",
+    "<|mask|>",
+    "<|begin_of_image|>",
+    "<|end_of_image|>",
+    "<|context_of_image|>",
+    "<|begin_of_video|>",
+    "<|end_of_video|>",
+    "<|context_of_video|>",
+    "<|begin_of_patch|>",
+    "<|end_of_patch|>",
+    "<|context_of_patch|>",
+    "<|begin_of_quad|>",
+    "<|end_of_quad|>",
+    "<|begin_of_ref|>",
+    "<|end_of_ref|>",
+    "<|begin_of_box|>",
+    "<|end_of_box|>",
+    "<|image|>",
+    "<|video|>",
+    "<|begin_of_audio|>",
+    "<|end_of_audio|>",
+    "<|context_of_audio|>",
+    "<|audio|>"
+  ],
+  "bos_token": {
+    "content": "<|beginoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<|mask|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenization_dream.py ADDED Viewed

	@@ -0,0 +1,340 @@

+# coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Qwen's implementations in this library.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Dream."""
+import json
+import os
+import unicodedata
+from functools import lru_cache
+from typing import Optional, Tuple
+import regex as re
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# File names of the two on-disk tokenizer assets. The keys match the `vocab_file` / `merges_file`
+# constructor arguments of `DreamTokenizer`; `save_vocabulary` writes files under these names.
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+# NOTE(review): presumably the maximum input length (in tokens) per checkpoint name; it is not
+# referenced anywhere in the visible part of this file -- confirm before relying on it.
+MAX_MODEL_INPUT_SIZES = {"dream/dream-tokenizer": 32768}
+# Pre-tokenization pattern applied to the text before BPE. It relies on Unicode property classes
+# (\p{L}, \p{N}), which is why the third-party `regex` module is imported as `re` instead of the stdlib one.
+PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+@lru_cache()
+# Adapted from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
+def bytes_to_unicode():
+    """
+    Build the reversible byte -> unicode-character table used by byte-level BPE.
+    Bytes that already correspond to a visible, non-whitespace character map to that same character. Every
+    remaining byte (whitespace, control characters, ...) is assigned a stand-in code point starting at 256, in
+    ascending byte order, so that the BPE code never has to deal with whitespace/control characters.
+    All 256 byte values are covered, which means any UTF-8 input can be represented without an unknown token.
+    Returns:
+        `dict`: mapping from byte value (`int`) to a single-character `str`.
+    """
+    printable = [
+        *range(ord("!"), ord("~") + 1),
+        *range(ord("¡"), ord("¬") + 1),
+        *range(ord("®"), ord("ÿ") + 1),
+    ]
+    # Printable bytes keep their own character and come first in the table.
+    table = {byte: chr(byte) for byte in printable}
+    shift = 0
+    for byte in range(2**8):
+        if byte in table:
+            continue
+        # Non-printable bytes get the next unused code point above the byte range.
+        table[byte] = chr(2**8 + shift)
+        shift += 1
+    return table
+# Adapted from transformers.models.gpt2.tokenization_gpt2.get_pairs
+def get_pairs(word):
+    """
+    Collect every pair of adjacent symbols occurring in `word`.
+    `word` is a non-empty sequence of symbols (a tuple of variable-length strings, or a plain string).
+    A single-symbol word yields an empty set.
+    """
+    word[0]  # an empty word is rejected with IndexError
+    return set(zip(word, word[1:]))
+class DreamTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Dream tokenizer. Based on byte-level Byte-Pair-Encoding.
+    Like GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+    ```python
+    >>> from transformers import AutoTokenizer
+    >>> tokenizer = AutoTokenizer.from_pretrained("Dream-org/Dream-v0-Base-7B", trust_remote_code=True)
+    >>> tokenizer("Hello world")["input_ids"]
+    [9707, 1879]
+    >>> tokenizer(" Hello world")["input_ids"]
+    [21927, 1879]
+    ```
+    This is expected.
+    You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*):
+            The beginning of sequence token. Not applicable for this tokenizer.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
+            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
+        split_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not the special tokens should be split during the tokenization process. The default behavior is
+            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then
+            `tokenizer.tokenize("<|endoftext|>")` returns `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`,
+            then `tokenizer.tokenize("<|endoftext|>")` will give `['<', '|', 'endo', 'ft', 'ext', '|', '>']`.
+            This argument is only supported for `slow` tokenizers for the moment.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token=None,
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        clean_up_tokenization_spaces=False,
+        split_special_tokens=False,
+        **kwargs,
+    ):
+        """Load the vocabulary and merge table from disk, then initialise the base tokenizer."""
+        # Dream vocab does not contain control tokens; added tokens need to be special
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_merges = []
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            for i, line in enumerate(merges_handle):
+                line = line.strip()
+                # Skip the optional "#version:" header on the first line and any blank lines.
+                if (i == 0 and line.startswith("#version:")) or not line:
+                    continue
+                bpe_merges.append(tuple(line.split()))
+        # Lower rank == merge learned earlier == applied first in `bpe`.
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        # NOTE: the cache can grow without bound and will get really large for long running processes
+        # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
+        # not a memory leak but appears as one.
+        # GPT2Tokenizer has the same problem, so let's be consistent.
+        self.cache = {}
+        self.pat = re.compile(PRETOKENIZE_REGEX)
+        if kwargs.get("add_prefix_space", False):
+            # `__name__` (with trailing underscores) is required here: inside a class body an identifier
+            # spelled `__name` is name-mangled to `_DreamTokenizer__name` and raises AttributeError.
+            logger.warning_once(
+                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
+            )
+        super().__init__(
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            split_special_tokens=split_special_tokens,
+            **kwargs,
+        )
+    @property
+    def vocab_size(self) -> int:
+        """Number of entries in the base vocabulary loaded from `vocab_file` (added tokens excluded)."""
+        return len(self.encoder)
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
+    def get_vocab(self):
+        """Return a new dict merging the base vocabulary with the added tokens (added tokens win on clashes)."""
+        return dict(self.encoder, **self.added_tokens_encoder)
+    # Adapted from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
+    def bpe(self, token):
+        """
+        Apply the learned BPE merges to a single pre-token.
+        Args:
+            token (`str`): a pre-token already mapped through `byte_encoder`.
+        Returns:
+            `str`: the resulting sub-word symbols joined by single spaces. A token with fewer than two
+            characters is returned unchanged (and is not cached).
+        """
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+        if not pairs:
+            return token
+        while True:
+            # Pick the adjacent pair with the lowest merge rank; unknown pairs rank as +inf.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    # No further occurrence of `first`: copy the remainder verbatim.
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+    # Adapted from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(self.bpe(token).split(" "))
+        return bpe_tokens
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+    def decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        """Decode `token_ids` to text via the base class, with `spaces_between_special_tokens` off by default."""
+        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
+        # and cannot be configured elsewhere, but it should default to False for DreamTokenizer
+        return super().decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Write the vocabulary (JSON) and the merge table (text) into `save_directory`.
+        Args:
+            save_directory (`str`): existing directory to write into.
+            filename_prefix (`str`, *optional*): when given, prepended to both file names followed by `-`.
+        Returns:
+            `(vocab_file, merge_file)` paths, or `None` (after logging an error) if `save_directory` is not
+            a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            # Merges are written in rank order; a gap in the ranks is reported but does not abort the save.
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+        return vocab_file, merge_file
+    def prepare_for_tokenization(self, text, **kwargs):
+        """Normalize `text` to Unicode NFC before tokenization and return it with the untouched `kwargs`."""
+        text = unicodedata.normalize("NFC", text)
+        return (text, kwargs)

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff