OnAnOrange committed
Commit cb2d4b3 · verified · 1 Parent(s): 1cd2923

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,298 @@
+ ---
+ license: apache-2.0
+ ---
+
+ <center> <div style="text-align: center;"> <img src="https://raw.githubusercontent.com/ZHZisZZ/dllm/main/assets/logo.gif" width="400" />
+ </div> </center>
+
+ # Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1
+
+ Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 is a diffusion-based language model created by transforming the autoregressive backbone [Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct) into a diffusion architecture and fine-tuning it with block diffusion techniques within the [dLLM](https://github.com/ZHZisZZ/dllm) framework.
+
+ ## Model Overview
+
+ Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 has the following features:
+
+ <!-- - **Architecture**: Transformer encoder with 8192-token context -->
+ - **Training Objective**: [Block Discrete Denoising Diffusion Language Models (BD3-LMs)](https://arxiv.org/pdf/2503.09573)
+ - **Framework**: [dLLM](https://github.com/ZHZisZZ/dllm)
+ - **Base Model**: [Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct)
+ - **Datasets**: [opc-sft-stage1](https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage1) and [opc-sft-stage2](https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage2)
+
+ For training details, see the [W&B report](https://wandb.ai/asap-zzhou/dllm/reports/dLLM-Tiny-A2D--VmlldzoxNTI2NTEzOA).
+
+ ## Installation
+
+ ```shell
+ pip install torch transformers accelerate
+ ```
+
+ ## Quick Start
+
+ ```python
+ import math
+ import copy
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+
+ def add_gumbel_noise(logits, temperature):
+     # Gumbel-style perturbation in exp space; temperature == 0 means greedy argmax.
+     if temperature == 0:
+         return logits
+     logits = logits.to(torch.float64)
+     noise = torch.rand_like(logits, dtype=torch.float64)
+     g = (-torch.log(noise)) ** temperature
+     return logits.exp() / g
+
+
+ def get_num_transfer_tokens(mask_index, steps):
+     # Split the masked-token budget evenly over the denoising steps,
+     # handing the remainder to the first few steps.
+     mask_num = mask_index.sum(dim=1, keepdim=True)
+     base = mask_num // steps
+     rem = mask_num % steps
+     out = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.long) + base
+     for i in range(mask_num.size(0)):
+         out[i, : rem[i]] += 1
+     return out
+
+
+ def build_staircase_attention_mask(x, block_size, pad_id):
+     # Block-causal ("staircase") mask: each token attends to all tokens in its
+     # own block and in earlier blocks; padding is excluded on both sides.
+     B, T = x.shape
+     device = x.device
+
+     valid = x != pad_id
+     pos_raw = torch.cumsum(valid.long(), dim=-1)
+     position_ids = torch.where(valid, pos_raw - 1, torch.zeros_like(pos_raw)).long()
+
+     col = torch.arange(T, device=device)
+     block_ids = (col // block_size).view(1, T).expand(B, T)
+     block_ids = torch.where(valid, block_ids, torch.full_like(block_ids, -1))
+
+     q = block_ids.view(B, 1, T, 1)
+     k = block_ids.view(B, 1, 1, T)
+     attn = (k <= q) & (q >= 0) & (k >= 0)
+
+     return attn, position_ids
+
+
+ def diffusion_step_block(logits, x_block, mask_block, num_transfer, temperature, remasking):
+     # One denoising step within a block: sample candidates for the masked
+     # positions, then commit the `num_transfer` most confident ones.
+     B, L, _ = logits.shape
+     if not mask_block.any():
+         return x_block
+
+     noisy = add_gumbel_noise(logits, temperature)
+     x0 = noisy.argmax(dim=-1)
+
+     if remasking == "low_confidence":
+         p = F.softmax(logits, dim=-1)
+         conf = p.gather(-1, x0.unsqueeze(-1)).squeeze(-1)
+     elif remasking == "random":
+         conf = torch.rand((B, L), device=logits.device)
+     else:
+         raise ValueError(remasking)
+
+     x0 = torch.where(mask_block, x0, x_block)
+     neg_inf = torch.full_like(conf, -float("inf"))
+     conf = torch.where(mask_block, conf, neg_inf)
+
+     commit = torch.zeros_like(x_block, dtype=torch.bool)
+     for i in range(B):
+         k = int(num_transfer[i].item())
+         if k > 0:
+             valid = (conf[i] > -float("inf")).sum().item()
+             k = min(k, valid)
+             _, idx = torch.topk(conf[i], k)
+             commit[i, idx] = True
+
+     out = x_block.clone()
+     out[commit] = x0[commit]
+     return out
+
+
+ @torch.no_grad()
+ def generate(
+     model,
+     tokenizer,
+     prompt,
+     steps=128,
+     max_new_tokens=128,
+     block_size=32,
+     temperature=0.0,
+     cfg_scale=0.0,
+     remasking="low_confidence",
+ ):
+     device = model.device
+     mask_id = tokenizer.mask_token_id
+     pad_id = tokenizer.pad_token_id
+
+     x = prompt.to(device).long()  # expected shape: (batch, prompt_len)
+     B = x.size(0)
+
+     num_blocks = math.ceil(max_new_tokens / block_size)
+     steps_per_block = math.ceil(steps / num_blocks)
+     generated = 0
+
+     while generated < max_new_tokens:
+         T_prefix = x.size(1)
+         offset = T_prefix % block_size
+         room = block_size if offset == 0 else block_size - offset
+         cur_len = min(room, max_new_tokens - generated)
+         if cur_len <= 0:
+             break
+
+         attn_pfx, pos_pfx = build_staircase_attention_mask(x, block_size, pad_id)
+
+         # Cache the prefix once per block; every denoising step below reuses it.
+         out = model(x, attention_mask=attn_pfx, position_ids=pos_pfx, use_cache=True)
+         cond_past = out.past_key_values
+
+         if cfg_scale > 0:
+             un_x = x.clone()
+             un_x[:] = mask_id
+             out_un = model(un_x, attention_mask=attn_pfx, position_ids=pos_pfx, use_cache=True)
+             uncond_past = out_un.past_key_values
+         else:
+             uncond_past = None
+
+         block = torch.full((B, cur_len), mask_id, device=device, dtype=torch.long)
+         x = torch.cat([x, block], dim=1)
+         T_total = x.size(1)
+
+         block_mask = x[:, -cur_len:] == mask_id
+         num_transfer = get_num_transfer_tokens(block_mask, steps_per_block)
+         eff_steps = num_transfer.size(1)
+
+         full_attn, full_pos = build_staircase_attention_mask(x, block_size, pad_id)
+         attn_blk = full_attn[:, :, T_prefix:T_total, :]
+         pos_blk = full_pos[:, T_prefix:T_total]
+
+         for t in range(eff_steps):
+             x_blk = x[:, T_prefix:T_total]
+             m_blk = x_blk == mask_id
+
+             cond_logits = model(
+                 x_blk, attention_mask=attn_blk, position_ids=pos_blk,
+                 past_key_values=copy.deepcopy(cond_past), use_cache=False
+             ).logits
+
+             logits = cond_logits
+             if cfg_scale > 0:
+                 # Classifier-free guidance: extrapolate away from the unconditional logits.
+                 un_logits = model(
+                     x_blk, attention_mask=attn_blk, position_ids=pos_blk,
+                     past_key_values=copy.deepcopy(uncond_past), use_cache=False
+                 ).logits
+                 logits = un_logits + (cfg_scale + 1.0) * (cond_logits - un_logits)
+
+             x_blk_new = diffusion_step_block(
+                 logits, x_blk, m_blk, num_transfer[:, t], temperature, remasking
+             )
+             x[:, T_prefix:T_total] = x_blk_new
+
+             if (x_blk_new == tokenizer.eos_token_id).any():
+                 break
+
+         generated += cur_len
+
+     return x
+
+
+ device = "cuda"
+ model = AutoModelForMaskedLM.from_pretrained("dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1", dtype=torch.bfloat16, trust_remote_code=True).to(device).eval()
+ tokenizer = AutoTokenizer.from_pretrained("dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1", trust_remote_code=True)
+
+ question = "Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours?"
+ messages = [
+     {"role": "system", "content": "You are a helpful AI assistant."},
+     {"role": "user", "content": question}
+ ]
+ prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+
+ input_ids = tokenizer(prompt)["input_ids"]
+ input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)
+ text = generate(model, tokenizer, input_ids, steps=256, max_new_tokens=256, block_size=32, temperature=0.0, cfg_scale=0.0, remasking="low_confidence")
+ print(tokenizer.batch_decode(text[:, input_ids.shape[1]:], skip_special_tokens=False)[0])
+ ```
+
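+ Because the script decodes with `skip_special_tokens=False`, the raw output can contain special tokens such as `<|mask|>` or `<|im_end|>`. Below is a minimal cleanup sketch, reusing `text`, `input_ids`, and `tokenizer` from the script above; this helper is not part of the original example:
+
+ ```python
+ # Hypothetical post-processing (not in the original script): truncate at the
+ # first EOS token, then decode without special tokens.
+ out_ids = text[0, input_ids.shape[1]:]
+ eos_pos = (out_ids == tokenizer.eos_token_id).nonzero()
+ if len(eos_pos) > 0:
+     out_ids = out_ids[: eos_pos[0].item()]
+ print(tokenizer.decode(out_ids, skip_special_tokens=True))
+ ```
+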
+ ## Generation Parameters
+
+ | Parameter        | Description                                                                                          | Default          |
+ | ---------------- | ---------------------------------------------------------------------------------------------------- | ---------------- |
+ | `max_new_tokens` | Number of tokens to generate                                                                           | 256              |
+ | `steps`          | Number of diffusion denoising iterations                                                               | 256              |
+ | `temperature`    | Sampling temperature; set to `0.0` for deterministic generation                                        | 0.0              |
+ | `block_size`     | Token block size used during iterative denoising                                                       | 32               |
+ | `cfg_scale`      | Classifier-free guidance scale; higher values push generations more strongly toward the prompt         | 0.0              |
+ | `remasking`      | Criterion for choosing which masked tokens to commit at each denoising step (`random` or `low_confidence`) | `low_confidence` |
+
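+ For illustration, a stochastic run with classifier-free guidance could look like the sketch below, reusing `generate`, `model`, `tokenizer`, and `input_ids` from the Quick Start; the parameter values here are illustrative, not tuned recommendations:
+
+ ```python
+ # Stochastic decoding sketch: Gumbel temperature > 0, random remasking, mild CFG.
+ text = generate(
+     model, tokenizer, input_ids,
+     steps=256, max_new_tokens=256, block_size=32,
+     temperature=0.7, cfg_scale=1.5, remasking="random",
+ )
+ print(tokenizer.batch_decode(text[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
+ ```
+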
+ ## Command-Line Interface
+
+ Use the GitHub repo's demo script [examples/a2d/bm3lm/chat.py](https://github.com/ZHZisZZ/dllm/blob/main/examples/a2d/bm3lm/chat.py) for visualized generation:
+
+ ```shell
+ python -u examples/a2d/bm3lm/chat.py \
+     --model_name_or_path dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 \
+     --chat True
+ ```
+
+ ## Evaluation
+
+ <table style="border-collapse: collapse; width: 60%; text-align: center;">
+   <thead>
+     <tr style="border-bottom: 3px solid #333;">
+       <th style="padding: 8px; min-width: 320px; text-align: left;">Model</th>
+       <th style="padding: 8px;">HumanEval</th>
+       <th style="padding: 8px;">MBPP</th>
+     </tr>
+   </thead>
+
+   <!-- Diffusion model v1.1 highlighted -->
+   <tr style="background-color: #e8f2ff;">
+     <td style="padding: 8px;"><a href="https://huggingface.co/dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1"><code>Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1</code></a> (evaluated)</td>
+     <td>41.5</td><td>33.6</td>
+   </tr>
+
+   <!-- Diffusion model v0.1 highlighted -->
+   <tr style="background-color: #e8f2ff;">
+     <td style="padding: 8px;"><a href="https://huggingface.co/dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v0.1"><code>Qwen2.5-Coder-0.5B-Instruct-diffusion-v0.1</code></a> (evaluated)</td>
+     <td>28.1</td><td>23.0</td>
+   </tr>
+
+   <tr style="background-color: #e8f2ff;">
+     <td style="padding: 8px;"><a href="https://huggingface.co/fredzzp/open-dcoder-0.5B"><code>open-dcoder-0.5B</code></a> (reported)</td>
+     <td>20.8</td><td>35.2</td>
+   </tr>
+   <tr>
+     <td colspan="3" style="padding: 0; border-top: 3px double #666;"></td>
+   </tr>
+
+   <tr>
+     <td style="padding: 8px;"><a href="https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct"><code>Qwen2.5-Coder-0.5B-Instruct</code></a> (reported)</td>
+     <td>28.0</td><td>52.9</td>
+   </tr>
+
+ </table>
+
+ To automatically evaluate Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 on all benchmarks, run:
+
+ ```shell
+ bash examples/a2d/eval_bm3lm.sh \
+     --model_name_or_path dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1
+ ```
+
+ ## Citation
+
+ If you use Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 or dLLM, please cite:
+
+ ```bibtex
+ @misc{dllm,
+     author = {Zhanhui Zhou and Lingjie Chen and Hanghang Tong and Dawn Song},
+     title = {dLLM: Simple Diffusion Language Modeling},
+     year = {2025},
+     howpublished = {\url{https://github.com/ZHZisZZ/dllm}},
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|mask|>": 151665,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0]['role'] == 'system' %}
+         {{- messages[0]['content'] }}
+     {%- else %}
+         {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+     {%- endif %}
+     {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0]['role'] == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+     {%- else %}
+         {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {{- '<|im_start|>' + message.role }}
+         {%- if message.content %}
+             {{- '\n' + message.content }}
+         {%- endif %}
+         {%- for tool_call in message.tool_calls %}
+             {%- if tool_call.function is defined %}
+                 {%- set tool_call = tool_call.function %}
+             {%- endif %}
+             {{- '\n<tool_call>\n{"name": "' }}
+             {{- tool_call.name }}
+             {{- '", "arguments": ' }}
+             {{- tool_call.arguments | tojson }}
+             {{- '}\n</tool_call>' }}
+         {%- endfor %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- message.content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "architectures": [
+     "A2DQwen2LMHeadModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "modeling_qwen2.A2DQwen2Config",
+     "AutoModel": "modeling_qwen2.A2DQwen2Model",
+     "AutoModelForMaskedLM": "modeling_qwen2.A2DQwen2LMHeadModel"
+   },
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "dtype": "bfloat16",
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 896,
+   "initializer_range": 0.02,
+   "intermediate_size": 4864,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "max_window_layers": 24,
+   "model_type": "a2d-qwen2",
+   "num_attention_heads": 14,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 2,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "transformers_version": "4.57.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": [
+     151645
+   ],
+   "pad_token_id": 151643,
+   "transformers_version": "4.57.0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1724b7f2e845ab597ad34defbfa61073551f7c8333f769a4558a0536849517be
+ size 1260367448
modeling_qwen2.py ADDED
@@ -0,0 +1,171 @@
+ from typing import Optional
+
+ import torch
+ from torch import nn
+
+ import transformers
+ from transformers.cache_utils import Cache, DynamicCache
+ from transformers.modeling_outputs import BaseModelOutputWithPast
+ from transformers.processing_utils import Unpack
+ from transformers.utils import TransformersKwargs
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+
+ if transformers.utils.is_torch_flex_attn_available():
+     from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
+     from torch.nn.attention.flex_attention import BlockMask, create_block_mask
+ else:
+     # Register a fake type to avoid crashing for annotations and `isinstance` checks
+     BlockMask = torch.Tensor
+
+
+ class A2DQwen2Config(transformers.Qwen2Config):
+     model_type = "a2d-qwen2"  # <- NEW model_type
+
+
+ class A2DQwen2Model(transformers.Qwen2Model):
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> BaseModelOutputWithPast:
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         if use_cache and past_key_values is None:
+             past_key_values = DynamicCache(config=self.config)
+
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position = torch.arange(
+                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+             )
+
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+
+         """
+         # -------------------------------------------------------------
+         # ORIGINAL CODE (causal mask)
+         # -------------------------------------------------------------
+         # It may already have been prepared by e.g. `generate`
+         if not isinstance(causal_mask_mapping := attention_mask, dict):
+             # Prepare mask arguments
+             mask_kwargs = {
+                 "config": self.config,
+                 "input_embeds": inputs_embeds,
+                 "attention_mask": attention_mask,
+                 "cache_position": cache_position,
+                 "past_key_values": past_key_values,
+                 "position_ids": position_ids,
+             }
+             # Create the masks
+             causal_mask_mapping = {
+                 "full_attention": create_causal_mask(**mask_kwargs),
+             }
+             # The sliding window alternating layers are not always activated depending on the config
+             if self.has_sliding_layers:
+                 causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+         # -------------------------------------------------------------
+         # ORIGINAL CODE (causal mask)
+         # -------------------------------------------------------------
+         """
+         # -------------------------------------------------------------
+         # NEW CODE (bidirectional, padding-only mask)
+         # -------------------------------------------------------------
+         if not isinstance(causal_mask_mapping := attention_mask, dict):
+             # 1) If no mask is provided → treat all tokens as valid (no padding)
+             if attention_mask is None:
+                 attention_mask = torch.ones(
+                     inputs_embeds.shape[:2],
+                     device=inputs_embeds.device,
+                     dtype=torch.long
+                 )
+
+             # 2) If mask is not already a 4D attention mask → convert it
+             if not (
+                 isinstance(attention_mask, BlockMask)
+                 or (isinstance(attention_mask, torch.Tensor) and attention_mask.ndim == 4)
+             ):
+                 attention_mask = _prepare_4d_attention_mask(attention_mask, self.dtype)
+
+             # 3) Build causal mask mapping used by the attention layers
+             causal_mask_mapping = {"full_attention": attention_mask}
+
+             # Sliding-window layers share the same non-causal mask
+             if self.has_sliding_layers:
+                 causal_mask_mapping["sliding_attention"] = attention_mask
+         # -------------------------------------------------------------
+         # NEW CODE (bidirectional, padding-only mask)
+         # -------------------------------------------------------------
+
+         hidden_states = inputs_embeds
+
+         # create position embeddings to be shared across the decoder layers
+         position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+             hidden_states = decoder_layer(
+                 hidden_states,
+                 attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                 position_ids=position_ids,
+                 past_key_values=past_key_values,
+                 use_cache=use_cache,
+                 cache_position=cache_position,
+                 position_embeddings=position_embeddings,
+                 **kwargs,
+             )
+
+         hidden_states = self.norm(hidden_states)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=past_key_values if use_cache else None,
+         )
+
+
+ class A2DQwen2LMHeadModel(transformers.Qwen2ForCausalLM):
+     config: A2DQwen2Config
+
+     def __init__(self, config):
+         transformers.Qwen2PreTrainedModel.__init__(self, config)
+         self.model = A2DQwen2Model(config)
+         self.vocab_size = config.vocab_size
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+
+ transformers.AutoConfig.register("a2d-qwen2", A2DQwen2Config)
+ transformers.AutoModel.register(A2DQwen2Config, A2DQwen2LMHeadModel)
+ transformers.AutoModelForMaskedLM.register(A2DQwen2Config, A2DQwen2LMHeadModel)
+
+
+ if __name__ == "__main__":
+     import dllm
+     import torch
+     from transformers import AutoModel
+
+     # Load a config from a local path (either a directory containing config.json, or the file itself)
+     config_path = dllm.utils.resolve_with_base_env(
+         "Qwen/Qwen2.5-0.5B", "BASE_MODELS_DIR"
+     )
+     config = A2DQwen2Config.from_pretrained(config_path)
+     if hasattr(config, "auto_map"):
+         delattr(config, "auto_map")
+     if hasattr(config, "architectures"):
+         delattr(config, "architectures")
+
+     torch.set_default_device("cuda")
+     model = A2DQwen2LMHeadModel(config)
+     model.save_pretrained("models-tmp/a2d-qwen2")
+     auto_model = AutoModel.from_pretrained("models-tmp/a2d-qwen2")
+
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<|mask|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a59820ad3f728fff77cf7e4188532fc45e5f80cd0299cde28046bd2b51c64bdf
+ size 11422081
tokenizer_config.json ADDED
@@ -0,0 +1,217 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<|mask|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "mask_token": "<|mask|>",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9097661c1514bf9514858f758273a38b09545a96d1619951d90b456d240e3ddc
+ size 6840
vocab.json ADDED
The diff for this file is too large to render. See raw diff