Instructions to use JorgeVanco/diffusionGPT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use JorgeVanco/diffusionGPT with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="JorgeVanco/diffusionGPT")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("JorgeVanco/diffusionGPT")
model = AutoModelForMaskedLM.from_pretrained("JorgeVanco/diffusionGPT")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use JorgeVanco/diffusionGPT with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "JorgeVanco/diffusionGPT"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "JorgeVanco/diffusionGPT",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/JorgeVanco/diffusionGPT

SGLang

How to use JorgeVanco/diffusionGPT with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "JorgeVanco/diffusionGPT" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "JorgeVanco/diffusionGPT",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "JorgeVanco/diffusionGPT" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "JorgeVanco/diffusionGPT",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use JorgeVanco/diffusionGPT with Docker Model Runner:
```
docker model run hf.co/JorgeVanco/diffusionGPT
```

JorgeVanco committed on Jan 30

Commit

8c2cc2d

verified ·

1 Parent(s): 7c53e69

Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

added_tokens.json +8 -0
chat_template.jinja +4 -0
config.json +57 -0
merges.txt +0 -0
model.safetensors +3 -0
pipeline.py +350 -0
special_tokens_map.json +41 -0
tokenizer.json +0 -0
tokenizer_config.json +74 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "<eos>": 50259,
+  "<mask>": 50258,
+  "<pad>": 50257,
+  "<|delete|>": 50260,
+  "<|im_end|>": 50262,
+  "<|im_start|>": 50261
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% for message in messages %}<|im_start|>{{ message['role'] }}
+{% if message['role'] == 'assistant' %}{% generation %}{{ message['content'] }}<|im_end|>{% endgeneration %}{% else %}{{ message['content'] }}<|im_end|>{% endif %}
+{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "ModernBertForMaskedLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50256,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "custom_pipelines": {
+    "text-diffusion": {
+      "impl": "pipeline.TextDiffusionPipeline",
+      "pt": [
+        "AutoModelForMaskedLM"
+      ],
+      "tf": []
+    }
+  },
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "float32",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50259,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 1280,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "mask_token_id": 50258,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 10,
+  "num_hidden_layers": 20,
+  "pad_token_id": 50257,
+  "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "seq_length": 2048,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.56.2",
+  "use_cache": false,
+  "vocab_size": 50263
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2451886549ee8a6934888a3c296b22d212b3d6725322ab8204fd89db2e532354
+size 2361481436

pipeline.py ADDED Viewed

	@@ -0,0 +1,350 @@

+from transformers import BatchEncoding, Pipeline
+import torch
+from typing import Any, Generator
+class TextDiffusionPipeline(Pipeline):
+    def _sanitize_parameters(
+        self,
+        num_steps: int = 50,
+        allow_edits: bool = True,
+        use_confidence: bool = False,
+        stop_token: None = None,
+        **kwargs
+    ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
+        # Allow user to control the number of steps (e.g., diffusion steps)
+        # default to 10 steps
+        forward_kwargs = {
+            "num_steps": num_steps,
+            "allow_edits": allow_edits,
+            "use_confidence": use_confidence,
+            "stop_token": stop_token
+        }
+        preprocess_kwargs = {}
+        if "max_length" in kwargs:
+            preprocess_kwargs["max_length"] = kwargs["max_length"]
+        return preprocess_kwargs, forward_kwargs, {}
+    def preprocess(self, input_text, max_length=None) -> BatchEncoding | Any:
+        if self.tokenizer is None:
+            raise ValueError("Tokenizer was not passed to the pipeline!")
+        # Standard tokenization
+        if max_length is None:
+            # Safely access config if it exists, default to 512
+            max_length = getattr(self.model.config, "seq_length", 512)
+        if input_text is None:
+            input_text = ""
+        tokenized_text = self.tokenizer.encode(input_text)
+        if len(tokenized_text) < max_length:
+            input_ids = torch.full((1, max_length), self.tokenizer.mask_token_id, dtype=torch.long) # type: ignore
+            input_ids[0, :len(tokenized_text)] = torch.tensor(tokenized_text, dtype=torch.long)
+            return BatchEncoding({
+                "input_ids": input_ids,
+                "attention_mask": torch.ones_like(input_ids)
+            })
+        return self.tokenizer(
+            input_text,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+        )
+    @torch.no_grad()
+    def diffusion_generator(
+        self,
+        input_ids: torch.Tensor,
+        num_steps: int,
+        allow_edits: bool = True,
+        use_confidence: bool = False
+    ) -> Generator[torch.Tensor, None, None]:
+        """
+        Run the iterative unmasking loop and yield the sequence after every step.
+        Positions that hold a mask or pad token in `input_ids` are the only ones
+        that are ever rewritten; every other input position is left untouched.
+        Args:
+            input_ids (torch.Tensor): Token ids, indexed below as (batch, sequence).
+            num_steps (int): Number of diffusion steps to run.
+            allow_edits (bool): Whether already revealed tokens may be re-sampled.
+            use_confidence (bool): Reveal the most confident samples first instead
+                of revealing positions at random.
+        Yields:
+            torch.Tensor: A copy of the input first (step 0), then a copy of the
+            state after each step, i.e. `num_steps + 1` tensors in total.
+        Raises:
+            ValueError: If the pipeline has no tokenizer.
+        """
+        if self.tokenizer is None:
+            raise ValueError("Tokenizer was not passed to the pipeline!")
+        current_state: torch.Tensor = input_ids.clone()
+        yield current_state.clone() # Yield Step 0
+        # Determine which tokens can be re-masked (i.e., mask and pad tokens)
+        # This is computed once, from the input, and protects the prompt below.
+        initial_mask = (current_state == self.tokenizer.mask_token_id) | \
+                       (current_state == self.tokenizer.pad_token_id)
+        for step in range(num_steps):
+            # Time runs from 1 (first step) down to 0 (after the last step)
+            t_current = 1 - step / num_steps
+            t_next = 1 - (step + 1) / num_steps
+            # Predict full text with model
+            # NOTE: only input_ids are passed; no attention mask is given to the model
+            output = self.model(input_ids=current_state)
+            logits = output.logits
+            # Set the mask-token logit to the lowest finite value so it is effectively never sampled
+            logits[:, :, self.tokenizer.mask_token_id] = torch.finfo(logits.dtype).min
+            # Ancestral sampling logic
+            probs = torch.softmax(logits, dim=-1)
+            dist = torch.distributions.Categorical(probs)
+            sampled_ids = dist.sample()
+            # Calculate Unmasking Probability (Equation 7 https://arxiv.org/pdf/2406.07524)
+            # P(unmask | masked) = (alpha_s - alpha_t) / (1 - alpha_t)
+            # mapping: alpha_t = (1 - t_current), alpha_s = (1 - t_next)
+            # resulting simplified formula: (t_current - t_next) / t_current
+            if step < num_steps - 1:
+                unmasking_prob = (t_current - t_next) / t_current
+            else:
+                unmasking_prob = 1.0 # Force unmask at the end
+            # Positions that are still mask/pad in the current state
+            remasking_mask: torch.Tensor = (current_state == self.tokenizer.mask_token_id) | \
+                             (current_state == self.tokenizer.pad_token_id) # type: ignore
+            if use_confidence:
+                # Get the confidence (probability) of the tokens we just sampled
+                sample_probs = probs.gather(-1, sampled_ids.unsqueeze(-1)).squeeze(-1)
+                # Determine how many tokens to unmask this step
+                if step < num_steps - 1:
+                    num_masked = remasking_mask.sum(dim=1, keepdim=True)
+                    num_to_unmask = (num_masked.float() * unmasking_prob).ceil().long()
+                else:
+                    num_to_unmask = remasking_mask.sum(dim=1, keepdim=True)
+                # Select Top-K most confident tokens
+                # Set confidence of already visible tokens to -inf so they aren't picked
+                candidate_confidences = sample_probs.clone()
+                candidate_confidences[~remasking_mask] = -float('inf')
+                unmasking_mask = torch.zeros_like(remasking_mask, dtype=torch.bool)
+                # topk needs a single k, so take the largest per-row count and
+                # switch off the surplus picks per row with mask_k
+                max_k = num_to_unmask.max().item()
+                if max_k > 0:
+                    _, top_indices = candidate_confidences.topk(k=max_k, dim=1)
+                    range_tensor = torch.arange(max_k, device=current_state.device).unsqueeze(0)
+                    mask_k = range_tensor < num_to_unmask
+                    unmasking_mask.scatter_(1, top_indices, mask_k)
+            else:
+                # Random Unmasking
+                unmasking_mask = torch.rand_like(current_state, dtype=torch.float) < unmasking_prob
+            # Reveal only positions that are still masked and were masked in the input
+            update_mask = unmasking_mask & remasking_mask & initial_mask
+            if allow_edits: # Apply Seed Diffusion Editing Logic (Section 3.1 in https://arxiv.org/pdf/2508.02193)
+                alpha_t = 0.1 * (1 - step / num_steps)  # alpha_t decreases from 0.1 to 0 (Seed Diffusion)
+                edit_mask = torch.rand_like(current_state, dtype=torch.float) < alpha_t
+                # Only already revealed tokens (not mask, pad or eos) can be edited
+                is_visible = (current_state != self.tokenizer.mask_token_id) & \
+                             (current_state != self.tokenizer.pad_token_id) & \
+                             (current_state != self.tokenizer.eos_token_id)
+                edit_mask = is_visible & edit_mask & initial_mask # Use initial_mask to avoid editing original prompt
+                # Combine both masks
+                update_mask = update_mask | edit_mask
+            # Update current state
+            current_state[update_mask] = sampled_ids[update_mask]
+            yield current_state.clone() # Yield after each step
+    @torch.no_grad()
+    def _forward(
+        self,
+        model_inputs: torch.Tensor,
+        num_steps: int = 50,
+        allow_edits: bool = True,
+        use_confidence: bool = False,
+        stop_token: None = None
+    ) -> dict[str, Any]:
+        if self.tokenizer is None:
+            raise ValueError("Tokenizer was not passed to the pipeline!")
+        input_ids = model_inputs["input_ids"]
+        all_states = list(self.diffusion_generator(input_ids=input_ids, num_steps=num_steps, allow_edits=allow_edits, use_confidence=use_confidence))
+        final_state = all_states[-1]
+        return {"final_state": final_state, "history": all_states}
+    @torch.no_grad()
+    def stream_generation(
+        self,
+        input_text: str,
+        num_steps: int = 50,
+        allow_edits: bool = True,
+        use_confidence: bool = False,
+        max_length: int | None = None,
+        stop_token: str | None = None
+    ) -> Generator[str, None, None]:
+        """
+        Public method to stream text generation step-by-step.
+        """
+        # 1. Preprocess
+        inputs = self.preprocess(input_text, max_length)
+        input_ids = inputs["input_ids"].to(self.model.device) # type: ignore
+        # 2. Iterate over generator
+        for step_tensor in self.diffusion_generator(input_ids=input_ids, num_steps=num_steps, allow_edits=allow_edits, use_confidence=use_confidence):
+            # Decode current state
+            text = self.tokenizer.decode(step_tensor[0], skip_special_tokens=False) # type: ignore
+            yield text
+        if stop_token is not None and stop_token in text[len(input_text):]:
+            text = input_text + text[len(input_text):].split(stop_token)[0]
+            yield text
+    def postprocess(self, model_outputs) -> list[str] | Any:
+        if self.tokenizer is None:
+            raise ValueError("Tokenizer was not passed to the pipeline!")
+        # Convert final tensor to image/text
+        final_ids = model_outputs["final_state"]
+        return {
+            "decoded_texts": self.tokenizer.batch_decode(final_ids, skip_special_tokens=False),
+            "history": model_outputs["history"],
+            "final_ids": final_ids
+        }
+    @torch.no_grad()
+    def block_diffusion_generator(
+        self, input_ids: torch.Tensor,
+        block_size: int,
+        max_length: int,
+        num_steps: int,
+        allow_edits: bool = True,
+        use_confidence: bool = False,
+        stop_token: str | None = None
+    ) -> Generator[torch.Tensor, None, None]:
+        """
+        Generator that yields the diffusion states block-by-block.
+        Args:
+            input_ids (torch.Tensor): Initial input IDs with context.
+            block_size (int): Number of tokens to generate in each block.
+            max_length (int): Max length of the generated text.
+            num_steps (int): Number of diffusion steps per block.
+            allow_edits (bool): Whether to allow edits to existing tokens.
+            use_confidence (bool): Whether to use confidence-based unmasking.
+            stop_token (str | None): Token at which to stop generation early.
+        Yields:
+            torch.Tensor: The current state of the full sequence after each diffusion step.
+        """
+        assert num_steps > 0, "num_steps must be greater than 0"
+        if self.tokenizer is None:
+            raise ValueError("Tokenizer was not passed to the pipeline!")
+        max_seq_length = self.model.config.seq_length if hasattr(self.model.config, "seq_length") else 512
+        stop_token_id = self.tokenizer.convert_tokens_to_ids(stop_token) if stop_token is not None else None
+        assert block_size > 0 and block_size <= max_seq_length, f"block_size must be in (0, {max_seq_length}]"
+        full_sequence = input_ids.clone()
+        current_length = input_ids.shape[1]
+        while current_length < max_length:
+            remaining = max_length - current_length
+            this_block_len = min(block_size, remaining)
+            if this_block_len <= 0: break
+            # Append MASK tokens for the new block
+            mask_block = torch.full(
+                (1, this_block_len),
+                self.tokenizer.mask_token_id, # type: ignore
+                dtype=torch.long,
+                device=self.model.device
+            )
+            # Combine Context + New Masks
+            input_ids = torch.cat([full_sequence[:, -(max_seq_length - this_block_len):], mask_block], dim=1)
+            for step_tensor in self.diffusion_generator(
+                input_ids,
+                num_steps=num_steps,
+                allow_edits=allow_edits,
+                use_confidence=use_confidence
+            ):
+                current_generated_tokens = step_tensor[:, -this_block_len:]
+                yield torch.cat([full_sequence, current_generated_tokens], dim=1)
+            if stop_token_id is not None and stop_token_id in current_generated_tokens:
+                # Stop if EOS is generated
+                eos_index = (current_generated_tokens == stop_token_id).nonzero(as_tuple=True)[1] # type: ignore
+                current_generated_tokens = current_generated_tokens[:, :eos_index[0]]
+                yield torch.cat([full_sequence, current_generated_tokens], dim=1)
+                break
+            # Update full sequence and current length
+            full_sequence = torch.cat([full_sequence, current_generated_tokens], dim=1)
+            current_length = full_sequence.shape[1]
+    @torch.no_grad()
+    def semi_autoregressive_generate(
+        self,
+        input_text: str,
+        block_size: int = 64,
+        max_length: int = 256,
+        num_steps: int = 50,
+        allow_edits: bool = True,
+        use_confidence: bool = False
+    ) -> dict[str, Any]:
+        """
+        Semi-Autoregressive Generation:
+        Generates text in blocks using the diffusion model.
+        Each block is generated by appending MASK tokens to the current context
+        and running the diffusion process on the combined sequence.
+        Args:
+            input_text (str): The initial prompt text.
+            block_size (int): Number of tokens to generate in each block.
+            max_length (int): Max length of the generated text.
+            num_steps (int): Number of diffusion steps per block.
+            allow_edits (bool): Whether to allow edits to existing tokens.
+            use_confidence (bool): Whether to use confidence-based unmasking.
+        Returns:
+            dict[str, Any]: A dictionary containing the decoded texts, generation history, and final token IDs.
+        """
+        if self.tokenizer is None: raise ValueError("No tokenizer")
+        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(self.model.device) # type: ignore
+        all_states = list(self.block_diffusion_generator(input_ids, block_size, max_length, num_steps, allow_edits, use_confidence=use_confidence))
+        final_state = all_states[-1]
+        return {
+            "decoded_texts": self.tokenizer.batch_decode(final_state, skip_special_tokens=False),
+            "history": all_states,
+            "final_ids": final_state
+        }
+    @torch.no_grad()
+    def stream_semi_autoregressive_generate(
+        self,
+        input_text: str,
+        block_size: int = 64,
+        max_length: int = 256,
+        num_steps: int = 50,
+        allow_edits: bool = True,
+        use_confidence: bool = False,
+        stop_token: str | None = None
+    ) -> Generator[str, None, None]:
+        """
+        Streams the generation process block-by-block.
+        Yields the full decoded text at every diffusion step of every block.
+        Args:
+            input_text (str): The initial prompt text.
+            block_size (int): Number of tokens to generate in each block.
+            max_length (int): Max length of the generated text.
+            num_steps (int): Number of diffusion steps per block.
+            allow_edits (bool): Whether to allow edits to existing tokens.
+            use_confidence (bool): Whether to use confidence-based unmasking.
+            stop_token (None): Token at which to stop generation early.
+        Yields:
+            str: The current generated text after each diffusion step.
+        """
+        if self.tokenizer is None: raise ValueError("No tokenizer")
+        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(self.model.device) # type: ignore
+        for step_tensor in self.block_diffusion_generator(input_ids, block_size, max_length, num_steps, allow_edits, use_confidence=use_confidence, stop_token=stop_token):
+            # Decode current state
+            yield self.tokenizer.decode(step_tensor[0], skip_special_tokens=False) # type: ignore

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,74 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50258": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50259": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50260": {
+      "content": "<|delete|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50262": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1024,
+  "pad_token": "<pad>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff