Upload 22 files
- .gitattributes +1 -0
- app.py +724 -0
- examples/image.png +3 -0
- generators/__init__.py +4 -0
- generators/image_generation_generator.py +251 -0
- generators/parallel_generator.py +368 -0
- inference.py +245 -0
- model/__init__.py +1 -0
- model/__pycache__/__init__.cpython-311.pyc +0 -0
- model/__pycache__/configuration_llada.cpython-311.pyc +0 -0
- model/__pycache__/modeling_llada.cpython-311.pyc +0 -0
- model/__pycache__/modeling_xllmx_dimoo.cpython-311.pyc +0 -0
- model/configuration_llada.py +463 -0
- model/modeling_llada.py +1567 -0
- model/modeling_xllmx_dimoo.py +202 -0
- utils/__init__.py +4 -0
- utils/__pycache__/__init__.cpython-311.pyc +0 -0
- utils/__pycache__/generation_utils.cpython-311.pyc +0 -0
- utils/__pycache__/image_utils.cpython-311.pyc +0 -0
- utils/__pycache__/prompt_utils.cpython-311.pyc +0 -0
- utils/generation_utils.py +89 -0
- utils/image_utils.py +285 -0
- utils/prompt_utils.py +233 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/image.png filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,724 @@
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import gradio as gr
import torch
import math
from PIL import Image
from transformers import AutoTokenizer
from model import LLaDAForMultiModalGeneration
from utils.image_utils import (
    decode_vq_to_image, calculate_vq_params,
    generate_crop_size_list, var_center_crop, add_break_line,
    encode_img_with_breaks, encode_img_with_paint
)
from utils.prompt_utils import generate_text_image_to_text_image_prompt
import torch.nn.functional as F

MODEL = None
TOKENIZER = None
VQVAE = None
DEVICE = None
CURRENT_MODEL_PATH = None

SPECIAL_TOKENS = {
    "mask_token": 126336,
    "newline_token": 126084,
    "image_token_offset": 126356,
    "answer_start": 126354,
    "answer_end": 126355,
    "boi": 126349,
    "eoi": 126350,
    "uncondition": 126351
}

SYSTEM_PROMPT = "Generate an image applying the following editing instruction based on the original image."

def cosine_schedule(t):
    return torch.cos(t * math.pi / 2)

def add_gumbel_noise(logits, temperature=1.0, generator=None):
    if temperature == 0:
        return logits

    if generator is not None:
        uniform_noise = torch.rand(logits.shape, dtype=logits.dtype, device=logits.device, generator=generator)
    else:
        uniform_noise = torch.rand_like(logits)

    gumbel_noise = -torch.log(-torch.log(uniform_noise + 1e-10) + 1e-10)
    return logits + temperature * gumbel_noise

def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
    if generator is not None:
        noise = torch.randn(probs.shape, dtype=probs.dtype, device=probs.device, generator=generator)
    else:
        noise = torch.randn_like(probs)

    confidence = torch.log(probs + 1e-10) + temperature * noise
    sorted_confidence, sorted_indices = torch.sort(confidence, dim=-1, descending=False)

    if isinstance(mask_len, torch.Tensor):
        mask_len_clamped = torch.clamp(mask_len, 0, probs.shape[-1] - 1)
        mask_len_clamped = mask_len_clamped.long().squeeze(-1)
    else:
        mask_len_clamped = int(mask_len)

    if isinstance(mask_len_clamped, torch.Tensor):
        batch = probs.shape[0]
        masking = torch.zeros_like(probs, dtype=torch.bool, device=probs.device)
        for b in range(batch):
            k = mask_len_clamped[b].item()
            if k <= 0:
                continue
            low_idx = sorted_indices[b, :k]
            masking[b, low_idx] = True
    else:
        k = mask_len_clamped
        if k <= 0:
            masking = torch.zeros_like(probs, dtype=torch.bool, device=probs.device)
        else:
            low_idx = sorted_indices[:, :k]
            masking = torch.zeros_like(probs, dtype=torch.bool, device=probs.device)
            batch = probs.shape[0]
            for b in range(batch):
                masking[b, low_idx[b]] = True

    return masking

def get_num_transfer_tokens(text_masked_indices, text_steps):
    batch_size = text_masked_indices.shape[0]
    initial_masks = text_masked_indices.sum(dim=1)

    num_transfer = torch.zeros(batch_size, text_steps, dtype=torch.long, device=text_masked_indices.device)

    for b in range(batch_size):
        total_masks = initial_masks[b].item()
        remaining = total_masks

        for step in range(text_steps):
            ratio = (step + 1) / text_steps
            target_remaining = int(total_masks * (1 - ratio))
            tokens_to_unmask = max(0, remaining - target_remaining)
            num_transfer[b, step] = tokens_to_unmask
            remaining -= tokens_to_unmask

    return num_transfer

@torch.no_grad()
def decode_text_with_masks(combined_input_ids, text_start, text_end, tokenizer, mask_token):
    text_ids = combined_input_ids[0, text_start:text_end].cpu().tolist()

    result_parts = []
    consecutive_masks = 0

    for token_id in text_ids:
        if token_id == mask_token:
            consecutive_masks += 1
        else:
            if consecutive_masks > 0:
                if consecutive_masks <= 10:
                    result_parts.append("▓" * consecutive_masks)
                else:
                    result_parts.append(f"▓▓▓▓▓[...{consecutive_masks - 5} more]")
                consecutive_masks = 0

            try:
                token_text = tokenizer.decode([token_id], skip_special_tokens=False, clean_up_tokenization_spaces=False)
                if token_text.strip() or token_text in [' ', '\n', '\t']:
                    result_parts.append(token_text)
            except:
                result_parts.append(f"[{token_id}]")

    if consecutive_masks > 0:
        if consecutive_masks <= 10:
            result_parts.append("▓" * consecutive_masks)
        else:
            result_parts.append(f"▓▓▓▓▓[...{consecutive_masks - 5} more]")

    return "".join(result_parts)

@torch.no_grad()
def generate_ti2ti_stepwise(
    model, input_ids, text_start, text_end, image_start, seq_len, newline_every,
    text_steps=100, temperature=1.0, text_temperature=0.7, cfg_scale=0.0, cfg_img=4.0,
    uncon_text=None, uncon_image=None, tokenizer=None, remasking='low_confidence',
    noise_schedule=cosine_schedule, generator=None, text_vocab_size=126356,
    codebook_size=8192, vqvae=None, image_height=512, image_width=512,
):
    device = input_ids.device
    MASK_TOKEN = SPECIAL_TOKENS["mask_token"]
    NEW_LINE = SPECIAL_TOKENS["newline_token"]

    combined_input_ids = input_ids.clone()
    num_vq_tokens = seq_len
    total_image_len = seq_len + seq_len // newline_every
    image_end = image_start + total_image_len

    text_masked_indices = combined_input_ids[:, text_start:text_end] == MASK_TOKEN
    num_transfer_tokens = get_num_transfer_tokens(text_masked_indices, text_steps)

    image_generation_step_indices = torch.linspace(
        0, text_steps - 1, int(text_steps * 0.3)
    ).round().int().tolist()

    image_position_mapping = []
    for i in range(image_start, image_end):
        if combined_input_ids[0, i] != NEW_LINE:
            image_position_mapping.append(i)

    batch_size = combined_input_ids.shape[0]
    initial_text_display = decode_text_with_masks(combined_input_ids, text_start, text_end, tokenizer, MASK_TOKEN)
    last_generated_image = None

    yield 0, initial_text_display, None, f"Step 0/{text_steps}"

    for step in range(text_steps):
        cond_logits = model(combined_input_ids, infer=True, use_cache=False).logits

        text_masked_indices = combined_input_ids[:, text_start:text_end] == MASK_TOKEN

        if text_masked_indices.sum() > 0:
            text_logits = cond_logits[:, text_start:text_end, :]
            logits_with_noise = add_gumbel_noise(text_logits, temperature=text_temperature, generator=generator)
            x0 = torch.argmax(logits_with_noise, dim=-1)

            if remasking == 'low_confidence':
                p = F.softmax(text_logits.to(torch.float64), dim=-1)
                x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)
            elif remasking == 'random':
                if generator is not None:
                    x0_p = torch.rand(x0.shape, dtype=x0.dtype, device=x0.device, generator=generator)
                else:
                    x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
            else:
                x0_p = torch.ones_like(x0, dtype=torch.float)

            x0 = torch.where(text_masked_indices, x0, combined_input_ids[:, text_start:text_end])
            confidence = torch.where(text_masked_indices, x0_p, float('-inf'))

            transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
            for j in range(confidence.shape[0]):
                k = num_transfer_tokens[j, step].item()
                if k > 0:
                    _, select_index = torch.topk(confidence[j], k=k)
                    transfer_index[j, select_index] = True

            combined_input_ids[:, text_start:text_end][transfer_index] = x0[transfer_index]

        if step in image_generation_step_indices:
            vq_tokens_list = []
            mask_positions = []
            for idx, pos in enumerate(image_position_mapping):
                token = combined_input_ids[0, pos].item()
                if token == MASK_TOKEN:
                    vq_tokens_list.append(-1)
                    mask_positions.append(idx)
                else:
                    vq_token = token - text_vocab_size
                    vq_token = max(0, min(vq_token, codebook_size - 1))
                    vq_tokens_list.append(vq_token)

            vq_tokens_tensor = torch.tensor(vq_tokens_list, device=device).unsqueeze(0)
            unknown_map = vq_tokens_tensor == -1

            cond_image_logits_list = []
            for pos in image_position_mapping:
                cond_image_logits_list.append(
                    cond_logits[:, pos:pos+1, text_vocab_size:text_vocab_size+codebook_size]
                )
            cond_vq_logits = torch.cat(cond_image_logits_list, dim=1)

            if (cfg_scale > 0.0 and uncon_text is not None) or (cfg_img > 0.0 and uncon_image is not None):
                if uncon_text is None:
                    combined_uncond_text = combined_input_ids.clone()
                else:
                    combined_uncond_text = combined_input_ids.clone()
                    prefix_len = uncon_text.shape[1]
                    combined_uncond_text[:, :prefix_len] = uncon_text.to(device)

                if uncon_image is None:
                    combined_uncond_img = combined_input_ids.clone()
                else:
                    combined_uncond_img = combined_input_ids.clone()
                    prefix_len_img = uncon_image.shape[1]
                    combined_uncond_img[:, :prefix_len_img] = uncon_image.to(device)

                uncond_text_logits_full = model(combined_uncond_text, infer=True, use_cache=False).logits
                uncond_img_logits_full = model(combined_uncond_img, infer=True, use_cache=False).logits

                uncond_text_vq_list = []
                uncond_img_vq_list = []
                for pos in image_position_mapping:
                    uncond_text_vq_list.append(
                        uncond_text_logits_full[:, pos:pos+1, text_vocab_size:text_vocab_size+codebook_size]
                    )
                    uncond_img_vq_list.append(
                        uncond_img_logits_full[:, pos:pos+1, text_vocab_size:text_vocab_size+codebook_size]
                    )

                uncond_text_vq_logits = torch.cat(uncond_text_vq_list, dim=1)
                uncond_img_vq_logits = torch.cat(uncond_img_vq_list, dim=1)
            else:
                uncond_text_vq_logits = torch.zeros_like(cond_vq_logits)
                uncond_img_vq_logits = torch.zeros_like(cond_vq_logits)

            image_logits = cond_vq_logits
            if cfg_scale != 0.0:
                image_logits = image_logits + cfg_scale * (cond_vq_logits - uncond_text_vq_logits)
            if cfg_img != 0.0:
                image_logits = image_logits + cfg_img * (cond_vq_logits - uncond_img_vq_logits)

            probs = F.softmax(image_logits, dim=-1)

            if temperature == 0:
                sampled_ids = probs.argmax(dim=-1)
            else:
                sampled = probs.reshape(-1, image_logits.size(-1))
                if generator is not None:
                    sampled_ids = torch.multinomial(sampled, 1, generator=generator)[:, 0].view(*image_logits.shape[:-1])
                else:
                    sampled_ids = torch.multinomial(sampled, 1)[:, 0].view(*image_logits.shape[:-1])

            sampled_ids = torch.where(unknown_map, sampled_ids, vq_tokens_tensor)
            sampled_ids = torch.clamp(sampled_ids, 0, codebook_size - 1)

            selected_probs = torch.gather(probs, -1, sampled_ids.long()[..., None]).squeeze(-1)
            high_val = torch.finfo(selected_probs.dtype).max
            selected_probs = torch.where(unknown_map, selected_probs, high_val)

            ratio = 1.0 * (step + 1) / text_steps
            mask_ratio = noise_schedule(torch.tensor(ratio, device=device))
            unknown_counts = unknown_map.sum(dim=-1, keepdim=True)
            mask_len = (num_vq_tokens * mask_ratio).floor().unsqueeze(0).to(device)
            mask_len = torch.max(torch.tensor([1], device=device), torch.min(unknown_counts - 1, mask_len.to(device).long()))
            if mask_len.ndim == 1:
                mask_len = mask_len.unsqueeze(1)

            img_temp = temperature * (1.0 - ratio)
            masking = mask_by_random_topk(mask_len, selected_probs, img_temp, generator=generator)
            final_vq_tokens = torch.where(masking, torch.tensor(-1, device=device), sampled_ids)

            for idx, pos in enumerate(image_position_mapping):
                v = final_vq_tokens[0, idx].item()
                if v == -1:
                    combined_input_ids[0, pos] = MASK_TOKEN
                else:
                    combined_input_ids[0, pos] = int(v + text_vocab_size)

            try:
                decoded_image = decode_vq_to_image(
                    sampled_ids, None, None, image_height, image_width, vqvae
                )

                masked_positions_bool = masking[0]
                if masked_positions_bool.sum() > 0:
                    from PIL import ImageDraw
                    decoded_image = decoded_image.copy()
                    draw = ImageDraw.Draw(decoded_image, 'RGBA')

                    vae_scale = 2 ** (len(VQVAE.config.block_out_channels) - 1)
                    token_h = image_height // vae_scale
                    token_w = image_width // vae_scale
                    pixel_h = image_height // token_h
                    pixel_w = image_width // token_w

                    masked_indices = torch.where(masked_positions_bool)[0].cpu().tolist()
                    for masked_idx in masked_indices:
                        token_row = masked_idx // token_w
                        token_col = masked_idx % token_w

                        y1 = token_row * pixel_h
                        x1 = token_col * pixel_w
                        y2 = y1 + pixel_h
                        x2 = x1 + pixel_w

                        draw.rectangle([x1, y1, x2, y2], fill=(128, 128, 128, 120))

                last_generated_image = decoded_image
            except Exception as e:
                pass

        text_display = decode_text_with_masks(combined_input_ids, text_start, text_end, tokenizer, MASK_TOKEN)
        text_masks_remaining = (combined_input_ids[:, text_start:text_end] == MASK_TOKEN).sum().item()
        text_progress = (1 - text_masks_remaining / (text_end - text_start)) * 100

        status_msg = f"Step {step + 1}/{text_steps} | Text: {text_progress:.1f}%"
        if step in image_generation_step_indices:
            image_masks_remaining = sum(1 for pos in image_position_mapping if combined_input_ids[0, pos] == MASK_TOKEN)
            image_progress = (1 - image_masks_remaining / num_vq_tokens) * 100
            status_msg += f" | Image: {image_progress:.1f}%"

        if step % 5 == 0 or step in image_generation_step_indices or step == text_steps - 1:
            yield step + 1, text_display, last_generated_image, status_msg

    final_text_display = decode_text_with_masks(combined_input_ids, text_start, text_end, tokenizer, MASK_TOKEN)

    if last_generated_image is not None:
        final_image = last_generated_image
    else:
        final_vq_tokens = []
        final_mask_positions = []
        for idx, pos in enumerate(image_position_mapping):
            token = combined_input_ids[0, pos].item()
            if token != MASK_TOKEN:
                vq_token = token - text_vocab_size
                vq_token = max(0, min(vq_token, codebook_size - 1))
                final_vq_tokens.append(vq_token)
            else:
                final_vq_tokens.append(codebook_size // 2)
                final_mask_positions.append(idx)

        vq_tensor = torch.tensor(final_vq_tokens, dtype=torch.long, device=device).unsqueeze(0)
        final_image = decode_vq_to_image(vq_tensor, None, None, image_height, image_width, vqvae)

        if final_mask_positions:
            from PIL import ImageDraw
            final_image = final_image.copy()
            draw = ImageDraw.Draw(final_image, 'RGBA')

            vae_scale = 2 ** (len(VQVAE.config.block_out_channels) - 1)
            token_h = image_height // vae_scale
            token_w = image_width // vae_scale
            pixel_h = image_height // token_h
            pixel_w = image_width // token_w

            for masked_idx in final_mask_positions:
                token_row = masked_idx // token_w
                token_col = masked_idx % token_w

                y1 = token_row * pixel_h
                x1 = token_col * pixel_w
                y2 = y1 + pixel_h
                x2 = x1 + pixel_w

                draw.rectangle([x1, y1, x2, y2], fill=(128, 128, 128, 120))

    yield text_steps, final_text_display, final_image, "✓ Complete"

def load_model_and_vae(model_path, vae_path):
    global MODEL, TOKENIZER, VQVAE, DEVICE, CURRENT_MODEL_PATH

    if MODEL is not None and CURRENT_MODEL_PATH == model_path:
        return f"Model already loaded: {model_path}"

    try:
        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

        TOKENIZER = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        MODEL = LLaDAForMultiModalGeneration.from_pretrained(
            model_path, torch_dtype=torch.bfloat16, device_map="auto"
        )
        MODEL.eval()

        from diffusers import VQModel
        VQVAE = VQModel.from_pretrained(vae_path, subfolder="vqvae").to(DEVICE)

        CURRENT_MODEL_PATH = model_path

        return f"✓ Model loaded | Device: {DEVICE}"
    except Exception as e:
        MODEL = None
        TOKENIZER = None
        VQVAE = None
        CURRENT_MODEL_PATH = None
        return f"✗ Failed: {str(e)}"

def generate_wrapper(
    input_image, prompt_text, model_path, vae_path, height, width,
    text_steps, text_gen_length, text_block_length, cfg_scale, cfg_img,
    temperature, text_temperature, remasking_strategy, painting_mode,
    mask_h_ratio, mask_w_ratio, seed,
):
    global MODEL, TOKENIZER, VQVAE, DEVICE

    if MODEL is None or TOKENIZER is None or VQVAE is None:
        load_status = load_model_and_vae(model_path, vae_path)
        if "Failed" in load_status:
            yield "", None, load_status
            return

    if input_image is None:
        yield "", None, "✗ No input image"
        return

    if seed != 0:
        torch.manual_seed(seed)
        generator = torch.Generator(device=DEVICE).manual_seed(seed)
    else:
        generator = None

    MASK = SPECIAL_TOKENS["mask_token"]
    NEW_LINE = SPECIAL_TOKENS["newline_token"]
    BOA = SPECIAL_TOKENS["answer_start"]
    EOA = SPECIAL_TOKENS["answer_end"]
    BOI = SPECIAL_TOKENS["boi"]
    EOI = SPECIAL_TOKENS["eoi"]

    try:
        input_prompt, uncon_text = generate_text_image_to_text_image_prompt(
            prompt_text, SYSTEM_PROMPT
        )

        prompt_ids = TOKENIZER(input_prompt)["input_ids"]
        uncon_text_ids = TOKENIZER(uncon_text)["input_ids"]

        img = input_image.convert("RGB")
        crop_size_list = generate_crop_size_list((512 // 32) ** 2, 32)
        img = var_center_crop(img, crop_size_list=crop_size_list)

        input_img_token = encode_img_with_breaks(img, VQVAE)

        con_input_list = prompt_ids[:-1] + input_img_token + prompt_ids[-1:]
        uncon_input_text = uncon_text_ids[:-1] + input_img_token + uncon_text_ids[-1:]
        uncon_input_image = prompt_ids

        vae_scale = 2 ** (len(VQVAE.config.block_out_channels) - 1)
        seq_len, newline_every, token_grid_height, token_grid_width = calculate_vq_params(
            height, width, vae_scale
        )

        text_mask_tokens = [MASK] * text_gen_length

        if painting_mode:
            img_mask_token, img_vis = encode_img_with_paint(
                img, vqvae=VQVAE, mask_h_ratio=mask_h_ratio,
                mask_w_ratio=mask_w_ratio, mask_mode=painting_mode
            )
        else:
            img_mask_token = add_break_line(
                [MASK] * seq_len, token_grid_height, token_grid_width,
                new_number=NEW_LINE
            )

        end_token_ids = TOKENIZER("</answer>", add_special_tokens=False).input_ids
        pred_token = [BOA] + [BOI] + img_mask_token + [EOI] + text_mask_tokens + end_token_ids

        code_start = len(con_input_list)
        image_start = len(con_input_list) + 2
        image_end = image_start + len(img_mask_token)
        text_start = image_end + 1
        text_end = text_start + text_gen_length

        full_input_ids = con_input_list + pred_token
        con_input = torch.tensor(full_input_ids, device=DEVICE).unsqueeze(0)
        uncon_input_text_tensor = torch.tensor(uncon_input_text, device=DEVICE).unsqueeze(0)
        uncon_input_image_tensor = torch.tensor(uncon_input_image, device=DEVICE).unsqueeze(0)

        config = MODEL.config
        text_vocab_size = getattr(config, 'text_vocab_size', 126356)
        codebook_size = getattr(config, 'codebook_size', 8192)

        for step, text_display, image, status in generate_ti2ti_stepwise(
            model=MODEL, input_ids=con_input, text_start=text_start, text_end=text_end,
            image_start=image_start, seq_len=seq_len, newline_every=newline_every,
            text_steps=text_steps, temperature=temperature, text_temperature=text_temperature,
            cfg_scale=cfg_scale, cfg_img=cfg_img, uncon_text=uncon_input_text_tensor,
            uncon_image=uncon_input_image_tensor, tokenizer=TOKENIZER,
            remasking=remasking_strategy, noise_schedule=cosine_schedule,
            generator=generator, text_vocab_size=text_vocab_size,
            codebook_size=codebook_size, vqvae=VQVAE,
            image_height=height, image_width=width,
        ):
            yield text_display, image, status

    except Exception as e:
        import traceback
        yield "", None, f"✗ Error: {str(e)}"

css_styles = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
    max-width: 1400px !important;
    margin: auto;
}
.gr-button-primary {
    background: linear-gradient(90deg, #7c3aed 0%, #a855f7 100%) !important;
    border: none !important;
    color: white !important;
}
.gr-button-primary:hover {
    transform: scale(1.02);
    box-shadow: 0 4px 12px rgba(124, 58, 237, 0.4) !important;
}
.output-markdown {
    min-height: 400px !important;
    max-height: 600px !important;
    overflow-y: auto !important;
    padding: 12px !important;
    background: #fafafa !important;
    border-radius: 8px !important;
    border: 1px solid #e0e0e0 !important;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace !important;
    font-size: 13px !important;
    line-height: 1.5 !important;
}
.output-markdown .prose,
.output-markdown .prose * {
    font-size: 10px !important;
    line-height: 1.4 !important;
}
.output-markdown h1 {
    font-size: 1.4em !important;
    margin-top: 0.8em !important;
    margin-bottom: 0.4em !important;
    color: #333 !important;
}
.output-markdown h2 {
    font-size: 1.2em !important;
    margin-top: 0.8em !important;
    margin-bottom: 0.4em !important;
    color: #333 !important;
}
.output-markdown h3 {
    font-size: 1.1em !important;
    margin-top: 0.8em !important;
    margin-bottom: 0.4em !important;
    color: #333 !important;
}
.output-markdown code {
    background: #f0f0f0 !important;
    padding: 2px 4px !important;
    border-radius: 3px !important;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace !important;
    font-size: 12px !important;
}
.output-markdown pre {
    background: #f5f5f5 !important;
    padding: 8px !important;
    border-radius: 5px !important;
    overflow-x: auto !important;
    font-size: 12px !important;
}
.output-markdown ul, .output-markdown ol {
    padding-left: 18px !important;
    margin: 8px 0 !important;
}
.output-markdown li {
    margin: 4px 0 !important;
}
.output-markdown p {
    margin: 6px 0 !important;
}
.output-markdown strong {
    font-weight: 600 !important;
}
footer {display: none !important}
"""

with gr.Blocks(css=css_styles, theme=gr.themes.Soft(primary_hue="purple")) as demo:
    gr.Markdown(
        """
        # 🎨 MMaDA-Parallel: Text+Image to Text+Image Generation

        Real-time parallel generation with step-by-step visualization.

        **Github:** [tyfeld/MMaDA-Parallel-A](https://github.com/tyfeld/MMaDA-Parallel-A)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input")

            input_image = gr.Image(type="pil", label="Input Image")
            prompt_text = gr.Textbox(
                label="Editing Instruction",
                lines=3,
                value="Make the sky more dramatic with sunset colors",
                placeholder="Enter your editing instruction..."
            )

            with gr.Accordion("Model", open=False):
                model_path = gr.Textbox(
                    label="Model Path",
                    value="tyfeld/MMaDA-Parallel-A",
                    info="HuggingFace path or local directory"
                )
                vae_path = gr.Textbox(
                    label="VAE Path",
                    value="tyfeld/MMaDA-Parallel-A",
                    info="VQ-VAE checkpoint path"
                )

            with gr.Accordion("Parameters", open=False):
                with gr.Row():
                    height = gr.Slider(256, 768, value=512, step=64, label="Height")
                    width = gr.Slider(256, 768, value=512, step=64, label="Width")

                text_steps = gr.Slider(32, 512, value=128, step=32, label="Steps")
                text_gen_length = gr.Slider(64, 512, value=256, step=32, label="Text Length")
                text_block_length = gr.Slider(16, 128, value=32, step=16, label="Block Length")

                with gr.Row():
                    cfg_scale = gr.Slider(0, 5, value=2.5, step=0.5, label="Text CFG")
                    cfg_img = gr.Slider(0, 8, value=4.0, step=0.5, label="Image CFG")

                with gr.Row():
                    temperature = gr.Slider(0, 2, value=1.0, step=0.1, label="Image Temp")
                    text_temperature = gr.Slider(0, 2, value=0.7, step=0.1, label="Text Temp")

                remasking_strategy = gr.Dropdown(
                    choices=["low_confidence", "random"],
                    value="low_confidence",
                    label="Remasking"
                )

                seed = gr.Slider(0, 10000, value=0, step=1, label="Seed (0=random)")

            with gr.Accordion("Painting Mode", open=False):
                painting_mode = gr.Dropdown(
                    choices=[None, "inpainting", "outpainting"],
                    value=None,
                    label="Mode"
                )
                with gr.Row():
                    mask_h_ratio = gr.Slider(0.1, 1.0, value=0.5, step=0.1, label="Mask H")
                    mask_w_ratio = gr.Slider(0.1, 1.0, value=0.5, step=0.1, label="Mask W")

            generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

        with gr.Column(scale=2):
            gr.Markdown("### Output")

            status_text = gr.Textbox(label="Status", lines=2, interactive=False)

            with gr.Row():
                with gr.Column(scale=1.2):
                    output_text = gr.Markdown(
                        value="*Waiting...*",
                        label="Generated Text (▓ = masked)",
                        show_label=True,
                        container=True,
                        elem_classes=["output-markdown"]
                    )

                with gr.Column(scale=1):
                    output_image = gr.Image(label="Generated Image", type="pil", interactive=False)

    generate_btn.click(
        fn=generate_wrapper,
        inputs=[
            input_image, prompt_text, model_path, vae_path,
            height, width, text_steps, text_gen_length, text_block_length,
            cfg_scale, cfg_img, temperature, text_temperature,
            remasking_strategy, painting_mode, mask_h_ratio, mask_w_ratio, seed
        ],
        outputs=[output_text, output_image, status_text]
    )

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="MMaDA-Parallel Gradio Demo")
    parser.add_argument("--model_path", type=str, default="tyfeld/MMaDA-Parallel-A")
    parser.add_argument("--vae_path", type=str, default="tyfeld/MMaDA-Parallel-A")
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    print("Loading model...")
    load_status = load_model_and_vae(args.model_path, args.vae_path)
    print(load_status)

    demo.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)
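For context (not part of the committed files): a minimal sketch of driving the streaming generator from app.py above without the Gradio UI. The checkpoint paths, the example image, and the chosen parameter values are assumptions; generate_wrapper yields (text, image, status) tuples exactly as wired to the UI.

    # sketch: consume app.py's streaming generator directly
    from PIL import Image
    from app import load_model_and_vae, generate_wrapper

    print(load_model_and_vae("tyfeld/MMaDA-Parallel-A", "tyfeld/MMaDA-Parallel-A"))

    stream = generate_wrapper(
        input_image=Image.open("examples/image.png"),
        prompt_text="Make the sky more dramatic with sunset colors",
        model_path="tyfeld/MMaDA-Parallel-A",
        vae_path="tyfeld/MMaDA-Parallel-A",
        height=512, width=512,
        text_steps=128, text_gen_length=256, text_block_length=32,
        cfg_scale=2.5, cfg_img=4.0,
        temperature=1.0, text_temperature=0.7,
        remasking_strategy="low_confidence",
        painting_mode=None, mask_h_ratio=0.5, mask_w_ratio=0.5,
        seed=42,
    )
    final_image = None
    for text, image, status in stream:
        print(status)            # e.g. "Step 5/128 | Text: 3.9%"
        if image is not None:
            final_image = image  # last yielded image is the most complete one
    if final_image is not None:
        final_image.save("edited.png")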
examples/image.png
ADDED
Git LFS Details
generators/__init__.py
ADDED
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
"""
Generator modules
"""
generators/image_generation_generator.py
ADDED
@@ -0,0 +1,251 @@
# -*- coding: utf-8 -*-
"""
Image generation generator (with optional debug prints/saving)
"""
import torch
import math
import os
import numpy as np
from typing import Callable, Optional
from utils.generation_utils import cosine_schedule, gumbel_max_sample, mask_by_random_topk
from model import LLaDAForMultiModalGeneration


@torch.no_grad()
def generate_image(
    model,
    prompt: torch.LongTensor,
    *,
    seq_len: int = 1024,
    newline_every: int = 16,
    timesteps: int = 18,
    mask_token_id: int = 126336,
    newline_id: int = 126084,
    temperature: float = 1.0,
    cfg_scale: float = 0.0,
    uncon_ids: torch.LongTensor = None,
    code_start: Optional[int] = None,
    codebook_size: int = 8192,
    noise_schedule: Callable[[torch.Tensor], torch.Tensor] = cosine_schedule,
    text_vocab_size: Optional[int] = None,
    generator: Optional[torch.Generator] = None,
    use_cache=False,
    cache_ratio=0.9,
    refresh_interval=5,
    warmup_ratio=0.3,
    debug: bool = True,
    debug_log_dir: Optional[str] = None,
    max_print_tokens: int = 100
) -> torch.LongTensor:
    """
    MaskGit parallel decoding to generate VQ tokens

    Added debug=True to print shapes and token samples per step. Optional debug_log_dir to save numpy dumps.

    Args:
        debug: when True, print detailed info each step.
        debug_log_dir: directory to save per-step npy dumps (x, vq_mask, logits, sampled_full)
        max_print_tokens: maximum number of tokens/logits to print for arrays (prevents terminal spam)
    """

    if debug and debug_log_dir:
        os.makedirs(debug_log_dir, exist_ok=True)

    device = next(model.parameters()).device
    prompt = prompt.to(device)
    B, P = prompt.shape
    assert B == 1, "batch>1 not supported – wrap in loop if needed"

    x = prompt.clone()

    vq_mask = x == mask_token_id
    unknown_cnt = vq_mask.sum(dim=1, keepdim=True)
    vq_len = unknown_cnt

    if isinstance(model, LLaDAForMultiModalGeneration):
        model.caching(use_cache)
    else:  # DDP
        model.module.caching(use_cache)

    warmup_step = int(timesteps * warmup_ratio)
    refresh_steps = torch.zeros(timesteps, dtype=torch.bool)
    for step in range(timesteps):
        if not use_cache or step <= warmup_step or (step - warmup_step) % refresh_interval == 0:
            refresh_steps[step] = True
    compute_ratio = 1 - cache_ratio

    # Infer text vocabulary size
    if text_vocab_size is None:
        # call with a minimal input to get logits size
        vocab_total = model(torch.zeros(1, 1, dtype=torch.long, device=device), infer=True).logits.size(-1)
        text_vocab_size = vocab_total - codebook_size
    vocab_offset = text_vocab_size

    if debug:
        print("=== generate_image debug start ===")
        print(f"device={device}, seq_len={seq_len}, code_start={code_start}, codebook_size={codebook_size}")
        print(f"text_vocab_size={text_vocab_size}, vocab_offset={vocab_offset}")
        print(f"Initial x.shape={x.shape}, initial unknown_cnt={int(unknown_cnt.item())}")
        print("==================================")

    for step in range(timesteps):
        if unknown_cnt.item() == 0:
            if debug:
                print(f"[step {step}] All tokens filled, breaking early.")
            break

        # Calculate number of tokens to keep (continue masking) this round
        if step < timesteps - 1:
            frac = noise_schedule(torch.tensor([(step + 1) / timesteps], device=device))
            keep_n = (vq_len.float() * frac).floor().clamp_min(1).long()
        else:
            keep_n = torch.zeros_like(unknown_cnt)

        if use_cache and step and refresh_steps[step]:
            if isinstance(model, LLaDAForMultiModalGeneration):
                model.empty_cache()
            else:  # DDP
                model.module.empty_cache()

        if debug:
            print(f"\n--- step {step} ---")
            print(f"unknown_cnt={int(unknown_cnt.item())}, keep_n={int(keep_n.item())}, refresh_step={bool(refresh_steps[step])}")
            print(f"x.shape={x.shape}, vq_mask.sum()={int(vq_mask.sum().item())}")
            # print a slice of tokens around code_start for visibility if code_start is set
            if code_start is not None:
                cs = code_start
                sample_slice = x[0, cs:cs+min(50, x.shape[1]-cs)].detach().cpu().numpy().tolist()
                print(f"x tokens at code_start (first 50): {sample_slice[:min(len(sample_slice), max_print_tokens)]}")

        # Forward pass (with/without CFG)
        if cfg_scale > 0:
            # build uncond sequence
            uncond = torch.cat((uncon_ids.to(x.device), x[:, code_start-2:]), axis=1)
            uncond_vq_mask = torch.cat((torch.zeros((1, uncon_ids.size()[1]), dtype=torch.bool).to(x.device), vq_mask[:, code_start-2:]), axis=1)

            # conditional logits
            cond_out = model(x, infer=True, use_cache=use_cache)
            cond_logits = cond_out.logits[..., vocab_offset : vocab_offset + codebook_size]
            if debug:
                print(f"cond_logits shape: {cond_logits.shape}")
            cond_mask_logits = cond_logits[vq_mask].view(B, -1, codebook_size)
            """
            if debug:
                print(f"cond_mask_logits shape (after vq_mask): {tuple(cond_mask_logits.shape)}")
                # print few values
                tmp = cond_mask_logits.detach().cpu().numpy()
                flat_tmp = tmp.reshape(-1, tmp.shape[-1])
                if flat_tmp.shape[0] > 0:
                    print("cond_mask_logits[first_row, first_10]:", flat_tmp[0, :min(10, flat_tmp.shape[1])].tolist())
            """
            # unconditional logits
            uncond_out = model(uncond, infer=True, use_cache=use_cache)
            uncond_logits = uncond_out.logits[..., vocab_offset : vocab_offset + codebook_size]
            if debug:
                print(f"uncond_logits shape: {uncond_logits.shape}")
            uncond_mask_logits = uncond_logits[uncond_vq_mask].view(B, -1, codebook_size)
            """
            if debug:
                print(f"uncond_mask_logits shape: {tuple(uncond_mask_logits.shape)}")
                tmpu = uncond_mask_logits.detach().cpu().numpy()
                if tmpu.size:
                    print("uncond_mask_logits[first_row, first_10]:", tmpu.reshape(-1, tmpu.shape[-1])[0, :min(10, tmpu.shape[-1])].tolist())
            """
            logits = (1 + cfg_scale) * cond_mask_logits - cfg_scale * uncond_mask_logits
            if debug:
                print(f"combined logits shape: {logits.shape}")

        else:
            out = model(x, infer=True)
            # logits for masked positions: (B, num_masked, codebook_size)
            # here we index directly by boolean mask along sequence dim
            logits = out.logits[:, vq_mask[0], vocab_offset : vocab_offset + codebook_size]
            if debug:
                print(f"logits shape (no-cfg): {logits.shape}")
                ltmp = logits.detach().cpu().numpy()
                if ltmp.size:
                    print("logits[first_pos, first_10]:", ltmp[0, :min(10, ltmp.shape[1])].tolist() if ltmp.ndim == 2 else ltmp.reshape(-1, ltmp.shape[-1])[0, :min(10, ltmp.shape[-1])].tolist())

        # sample
        sampled = gumbel_max_sample(logits, temperature, generator=generator)
        sampled_full = sampled + vocab_offset  # bring to full token space
        probs = torch.softmax(logits, dim=-1)
        conf = probs.gather(-1, sampled.unsqueeze(-1)).squeeze(-1)

        if debug:
            print(f"sampled.shape={sampled.shape}, sampled_full.shape={sampled_full.shape}, conf.shape={conf.shape}")
            # print some sampled tokens
            sf_np = sampled_full.detach().cpu().numpy().reshape(-1).tolist()
            print(f"sampled_full(first {min(len(sf_np), max_print_tokens)}): {sf_np[:min(len(sf_np), max_print_tokens)]}")

        # write sampled tokens into x at masked positions
        flat_idx = vq_mask.nonzero(as_tuple=False)[:, 1]
        if debug:
            print(f"flat_idx (masked positions indices) length={flat_idx.shape[0]}")
            if flat_idx.numel() > 0:
                print(f"flat_idx first 30: {flat_idx[:min(30, flat_idx.shape[0])].detach().cpu().numpy().tolist()}")

        x.view(-1)[flat_idx] = sampled_full.view(-1)

        # confidence map (for display / selection)
        conf_map = torch.full_like(x, -math.inf, dtype=probs.dtype)
        conf_map.view(-1)[flat_idx] = conf.view(-1)

        if debug:
            # show some stats of conf_map in code region
            try:
                conf_np = conf.detach().cpu().numpy().reshape(-1)
                print(f"conf stats (min/mean/max): {float(conf_np.min()):.6f}/{float(conf_np.mean()):.6f}/{float(conf_np.max()):.6f}")
            except Exception:
                pass

        # mask selection -> re-mask some tokens for next step
        mask_sel = mask_by_random_topk(keep_n.squeeze(1), conf, temperature=temperature, generator=generator)
        if debug:
            print(f"mask_sel.shape={mask_sel.shape}, mask_sel.sum()={int(mask_sel.sum().item())}")
        x.view(-1)[flat_idx[mask_sel.view(-1)]] = mask_token_id
        vq_mask = x == mask_token_id
        unknown_cnt = vq_mask.sum(dim=1, keepdim=True)

        if debug:
            print(f"after masking, vq_mask.sum()={int(vq_mask.sum().item())}, unknown_cnt={int(unknown_cnt.item())}")

        # Save debug artifacts if requested
        if debug and debug_log_dir:
            step_base = os.path.join(debug_log_dir, f"step_{step}")
            try:
                np.save(step_base + "_x.npy", x.detach().cpu().numpy())
                np.save(step_base + "_vq_mask.npy", vq_mask.detach().cpu().numpy())
                # logits may be large; save as float32
                np.save(step_base + "_logits.npy", logits.detach().cpu().numpy().astype(np.float32))
                np.save(step_base + "_sampled_full.npy", sampled_full.detach().cpu().numpy())
            except Exception as e:
                print(f"[debug] failed to save debug npy at step {step}: {e}")

        # Update cond/uncond compute masks for caching only if cfg_scale>0
        if use_cache and step < timesteps - 1 and not refresh_steps[step+1] and cfg_scale > 0:
            cond_conf = cond_logits.max(dim=-1)[0]
            cond_conf_threshold = torch.quantile(cond_conf.to(torch.float), compute_ratio, dim=-1, keepdim=True)
            cond_to_compute_mask = cond_conf <= cond_conf_threshold

            uncond_conf = uncond_logits.max(dim=-1)[0]
            uncond_conf_threshold = torch.quantile(uncond_conf.to(torch.float), compute_ratio, dim=-1, keepdim=True)
            uncond_to_compute_mask = uncond_conf <= uncond_conf_threshold

            if debug:
                print(f"cond_conf shape: {cond_conf.shape}, threshold: {cond_conf_threshold.detach().cpu().numpy().tolist()}")
                print(f"uncond_conf shape: {uncond_conf.shape}, threshold: {uncond_conf_threshold.detach().cpu().numpy().tolist()}")

    # Remove newline tokens and shape properly
    vq_ids = x[0, code_start:-2]
    vq_ids = vq_ids[vq_ids != newline_id].view(1, seq_len)

    if debug:
        print("=== generate_image debug end ===")
        print(f"final vq_ids.shape={vq_ids.shape}")
        try:
            print("final vq_ids first 100:", vq_ids.detach().cpu().numpy().reshape(-1)[:min(max_print_tokens, vq_ids.numel())].tolist())
        except Exception:
            pass

    return vq_ids
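As an aside (not part of the commit): the keep_n computation above follows a MaskGit-style cosine schedule, so few VQ tokens are committed in early steps and most are committed near the end. The self-contained sketch below, using this file's default seq_len=1024 and timesteps=18, prints how many tokens stay masked versus newly committed per step; it needs only torch.

    # standalone illustration of the cosine keep/commit schedule
    import math
    import torch

    def cosine_schedule(t):
        return torch.cos(t * math.pi / 2)

    seq_len, timesteps = 1024, 18
    vq_len = torch.tensor([[seq_len]])
    prev_keep = seq_len
    for step in range(timesteps):
        if step < timesteps - 1:
            frac = cosine_schedule(torch.tensor([(step + 1) / timesteps]))
            keep_n = int((vq_len.float() * frac).floor().clamp_min(1).item())
        else:
            keep_n = 0  # final step commits everything that is still masked
        print(f"step {step:2d}: still masked {keep_n:4d}, newly committed {prev_keep - keep_n:4d}")
        prev_keep = keep_n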
generators/parallel_generator.py
ADDED
@@ -0,0 +1,368 @@
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
import math
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def add_gumbel_noise(logits, temperature=1.0, generator=None):
|
| 9 |
+
"""Add Gumbel noise to logits for sampling"""
|
| 10 |
+
if temperature == 0:
|
| 11 |
+
return logits
|
| 12 |
+
|
| 13 |
+
if generator is not None:
|
| 14 |
+
uniform_noise = torch.rand(logits.shape, dtype=logits.dtype, device=logits.device, generator=generator)
|
| 15 |
+
else:
|
| 16 |
+
uniform_noise = torch.rand_like(logits)
|
| 17 |
+
|
| 18 |
+
gumbel_noise = -torch.log(-torch.log(uniform_noise + 1e-10) + 1e-10)
|
| 19 |
+
|
| 20 |
+
return logits + temperature * gumbel_noise
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
|
| 24 |
+
"""
|
| 25 |
+
Mask tokens by random top-k selection based on confidence
|
| 26 |
+
probs: [batch, L] confidence scores (higher = more confident)
|
| 27 |
+
mask_len: tensor shape [batch, 1] or scalar, number of tokens to keep masked (lowest-confidence)
|
| 28 |
+
returns: boolean mask [batch, L] True where token should REMAIN masked
|
| 29 |
+
"""
|
| 30 |
+
if generator is not None:
|
| 31 |
+
noise = torch.randn(probs.shape, dtype=probs.dtype, device=probs.device, generator=generator)
|
| 32 |
+
else:
|
| 33 |
+
noise = torch.randn_like(probs)
|
| 34 |
+
|
| 35 |
+
# Add small noise to jitter confidences according to temperature
|
| 36 |
+
confidence = torch.log(probs + 1e-10) + temperature * noise # higher = more confident
|
| 37 |
+
|
| 38 |
+
# We want to mask lowest-confidence tokens -> find cutoff
|
| 39 |
+
sorted_confidence, sorted_indices = torch.sort(confidence, dim=-1, descending=False) # ascending
|
| 40 |
+
|
| 41 |
+
# mask_len may be float or tensor; ensure integer per-batch
|
| 42 |
+
if isinstance(mask_len, torch.Tensor):
|
| 43 |
+
mask_len_clamped = torch.clamp(mask_len, 0, probs.shape[-1] - 1)
|
| 44 |
+
mask_len_clamped = mask_len_clamped.long().squeeze(-1) # shape [batch]
|
| 45 |
+
else:
|
| 46 |
+
mask_len_clamped = int(mask_len)
|
| 47 |
+
|
| 48 |
+
# Build boolean mask: True for tokens to KEEP masked (lowest confidence)
|
| 49 |
+
if isinstance(mask_len_clamped, torch.Tensor):
|
| 50 |
+
batch = probs.shape[0]
|
| 51 |
+
masking = torch.zeros_like(probs, dtype=torch.bool, device=probs.device)
|
| 52 |
+
for b in range(batch):
|
| 53 |
+
k = mask_len_clamped[b].item()
|
| 54 |
+
if k <= 0:
|
| 55 |
+
continue
|
| 56 |
+
low_idx = sorted_indices[b, :k] # indices of lowest k confidences
|
| 57 |
+
masking[b, low_idx] = True
|
| 58 |
+
else:
|
| 59 |
+
# scalar k
|
| 60 |
+
k = mask_len_clamped
|
| 61 |
+
if k <= 0:
|
| 62 |
+
masking = torch.zeros_like(probs, dtype=torch.bool, device=probs.device)
|
| 63 |
+
else:
|
| 64 |
+
low_idx = sorted_indices[:, :k]
|
| 65 |
+
masking = torch.zeros_like(probs, dtype=torch.bool, device=probs.device)
|
| 66 |
+
batch = probs.shape[0]
|
| 67 |
+
for b in range(batch):
|
| 68 |
+
masking[b, low_idx[b]] = True
|
| 69 |
+
|
| 70 |
+
return masking
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def cosine_schedule(t):
|
| 74 |
+
"""Cosine noise schedule"""
|
| 75 |
+
return torch.cos(t * math.pi / 2)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def get_num_transfer_tokens(text_masked_indices, text_steps):
|
| 79 |
+
"""
|
| 80 |
+
Calculate number of tokens to unmask at each step
|
| 81 |
+
Returns: [batch_size, text_steps]
|
| 82 |
+
"""
|
| 83 |
+
batch_size = text_masked_indices.shape[0]
|
| 84 |
+
initial_masks = text_masked_indices.sum(dim=1) # [batch_size]
|
| 85 |
+
|
| 86 |
+
num_transfer = torch.zeros(batch_size, text_steps, dtype=torch.long, device=text_masked_indices.device)
|
| 87 |
+
|
| 88 |
+
for b in range(batch_size):
|
| 89 |
+
total_masks = initial_masks[b].item()
|
| 90 |
+
remaining = total_masks
|
| 91 |
+
|
| 92 |
+
for step in range(text_steps):
|
| 93 |
+
ratio = (step + 1) / text_steps
|
| 94 |
+
target_remaining = int(total_masks * (1 - ratio))
|
| 95 |
+
tokens_to_unmask = max(0, remaining - target_remaining)
|
| 96 |
+
num_transfer[b, step] = tokens_to_unmask
|
| 97 |
+
remaining -= tokens_to_unmask
|
| 98 |
+
|
| 99 |
+
return num_transfer
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def generate_ti2ti(
|
| 103 |
+
model,
|
| 104 |
+
input_ids,
|
| 105 |
+
text_start,
|
| 106 |
+
text_end,
|
| 107 |
+
image_start,
|
| 108 |
+
seq_len,
|
| 109 |
+
newline_every,
|
| 110 |
+
text_steps=100,
|
| 111 |
+
text_gen_length=256,
|
| 112 |
+
text_block_length=64,
|
| 113 |
+
timesteps=100,
|
| 114 |
+
temperature=1.0,
|
| 115 |
+
text_temperature=0.7,
|
| 116 |
+
cfg_scale=0.0,
|
| 117 |
+
cfg_img=4.0,
|
| 118 |
+
uncon_text=None,
|
| 119 |
+
uncon_image=None,
|
| 120 |
+
tokenizer=None,
|
| 121 |
+
remasking='low_confidence',
|
| 122 |
+
noise_schedule=cosine_schedule,
|
| 123 |
+
generator=None,
|
| 124 |
+
text_vocab_size=126356,
|
| 125 |
+
codebook_size=8192,
|
| 126 |
+
):
|
| 127 |
+
"""
|
| 128 |
+
Generate text and image jointly with interleaved generation.
|
| 129 |
+
Text generation uses cond logits only (text_cfg assumed 0).
|
| 130 |
+
Image generation (at scheduled steps) uses two CFGs:
|
| 131 |
+
- uncon_text (if provided): guidance relating to the text part
|
| 132 |
+
- uncon_image (if provided): guidance relating to the image part
|
| 133 |
+
"""
|
| 134 |
+
|
| 135 |
+
device = input_ids.device
|
| 136 |
+
MASK_TOKEN = 126336
|
| 137 |
+
NEW_LINE = 126084
|
| 138 |
+
|
| 139 |
+
# Clone input for modification
|
| 140 |
+
combined_input_ids = input_ids.clone()
|
| 141 |
+
|
| 142 |
+
# Calculate total image region length (including newlines)
|
| 143 |
+
num_vq_tokens = seq_len
|
| 144 |
+
total_image_len = seq_len + seq_len // newline_every
|
| 145 |
+
image_end = image_start + total_image_len
|
| 146 |
+
|
| 147 |
+
print(f"Interleaved generation: {text_steps} total steps")
|
| 148 |
+
print(f" - Text generation range: [{text_start}, {text_end})")
|
| 149 |
+
print(f" - Image generation range: [{image_start}, {image_end}) (total {total_image_len} including newlines)")
|
| 150 |
+
print(f" - VQ tokens: {num_vq_tokens}")
|
| 151 |
+
|
| 152 |
+
# Calculate number of tokens to unmask at each step for text
|
| 153 |
+
text_masked_indices = combined_input_ids[:, text_start:text_end] == MASK_TOKEN
|
| 154 |
+
num_transfer_tokens = get_num_transfer_tokens(text_masked_indices, text_steps)
|
| 155 |
+
|
| 156 |
+
# Schedule: when to perform image generation steps
|
| 157 |
+
image_generation_step_indices = torch.linspace(
|
| 158 |
+
text_steps // 4, text_steps - 1, timesteps
|
| 159 |
+
).round().int().tolist()
|
| 160 |
+
|
| 161 |
+
print(f" - Image generation at steps: {image_generation_step_indices[:5]}...{image_generation_step_indices[-5:]}")
|
| 162 |
+
|
| 163 |
+
# Build position mapping for image (excluding newlines)
|
| 164 |
+
image_position_mapping = []
|
| 165 |
+
for i in range(image_start, image_end):
|
| 166 |
+
if combined_input_ids[0, i] != NEW_LINE:
|
| 167 |
+
image_position_mapping.append(i)
|
| 168 |
+
|
| 169 |
+
assert len(image_position_mapping) == num_vq_tokens, f"Expected {num_vq_tokens} VQ tokens, got {len(image_position_mapping)}"
|
| 170 |
+
|
| 171 |
+
batch_size = combined_input_ids.shape[0]
|
| 172 |
+
|
| 173 |
+
# ========== Interleaved Generation Loop ==========
|
| 174 |
+
for step in tqdm(range(text_steps), desc="Interleaved generation"):
|
| 175 |
+
|
| 176 |
+
# ===== Forward pass: compute conditional logits once per step =====
|
| 177 |
+
with torch.no_grad():
|
| 178 |
+
cond_logits = model(combined_input_ids, infer=True, use_cache=False).logits # [B, L, V]
|
| 179 |
+
|
| 180 |
+
# ===== Text Generation Step (no CFG for text; use cond_logits directly) =====
|
| 181 |
+
text_masked_indices = combined_input_ids[:, text_start:text_end] == MASK_TOKEN
|
| 182 |
+
|
| 183 |
+
if text_masked_indices.sum() > 0:
|
| 184 |
+
# Extract text logits from cond (no guidance)
|
| 185 |
+
text_logits = cond_logits[:, text_start:text_end, :]
|
| 186 |
+
|
| 187 |
+
# Apply temperature & gumbel
|
| 188 |
+
logits_with_noise = add_gumbel_noise(text_logits, temperature=text_temperature, generator=generator)
|
| 189 |
+
x0 = torch.argmax(logits_with_noise, dim=-1) # [B, text_len]
|
| 190 |
+
|
| 191 |
+
# Compute confidence for remasking
|
| 192 |
+
if remasking == 'low_confidence':
|
| 193 |
+
p = F.softmax(text_logits.to(torch.float64), dim=-1)
|
| 194 |
+
x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # [B, text_len]
|
| 195 |
+
elif remasking == 'random':
|
| 196 |
+
if generator is not None:
|
| 197 |
+
x0_p = torch.rand(x0.shape, device=x0.device, generator=generator)  # float noise; x0 itself is an integer tensor
|
| 198 |
+
else:
|
| 199 |
+
x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
|
| 200 |
+
else:
|
| 201 |
+
raise NotImplementedError(remasking)
|
| 202 |
+
|
| 203 |
+
# keep already-unmasked tokens
|
| 204 |
+
x0 = torch.where(text_masked_indices, x0, combined_input_ids[:, text_start:text_end])
|
| 205 |
+
confidence = torch.where(text_masked_indices, x0_p, -np.inf)
|
| 206 |
+
|
| 207 |
+
# Select tokens to unmask based on confidence (top-k per batch element)
|
| 208 |
+
transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
|
| 209 |
+
for j in range(confidence.shape[0]):
|
| 210 |
+
k = num_transfer_tokens[j, step].item()
|
| 211 |
+
if k > 0:
|
| 212 |
+
_, select_index = torch.topk(confidence[j], k=k)
|
| 213 |
+
transfer_index[j, select_index] = True
|
| 214 |
+
|
| 215 |
+
# Unmask selected tokens into combined_input_ids
|
| 216 |
+
# Note: transfer_index is [B, text_len] boolean; place into full combined_input_ids
|
| 217 |
+
combined_input_ids[:, text_start:text_end][transfer_index] = x0[transfer_index]
|
| 218 |
+
|
| 219 |
+
# ===== Image Generation Step (scheduled) =====
|
| 220 |
+
if step in image_generation_step_indices:
|
| 221 |
+
# Build vq token list from current combined_input_ids (placeholder -1 for masked)
|
| 222 |
+
vq_tokens_list = []
|
| 223 |
+
for pos in image_position_mapping:
|
| 224 |
+
token = combined_input_ids[0, pos].item()
|
| 225 |
+
if token == MASK_TOKEN:
|
| 226 |
+
vq_tokens_list.append(-1)
|
| 227 |
+
else:
|
| 228 |
+
vq_token = token - text_vocab_size
|
| 229 |
+
vq_token = max(0, min(vq_token, codebook_size - 1))
|
| 230 |
+
vq_tokens_list.append(vq_token)
|
| 231 |
+
|
| 232 |
+
vq_tokens_tensor = torch.tensor(vq_tokens_list, device=device).unsqueeze(0) # [1, num_vq_tokens]
|
| 233 |
+
unknown_map = vq_tokens_tensor == -1 # True where masked
|
| 234 |
+
|
| 235 |
+
# Extract cond_vq_logits from cond_logits (for VQ positions and vocab offset)
|
| 236 |
+
cond_image_logits_list = []
|
| 237 |
+
for pos in image_position_mapping:
|
| 238 |
+
cond_image_logits_list.append(cond_logits[:, pos:pos+1, text_vocab_size:text_vocab_size+codebook_size])
|
| 239 |
+
cond_vq_logits = torch.cat(cond_image_logits_list, dim=1) # [B, num_vq_tokens, codebook_size]
|
| 240 |
+
|
| 241 |
+
# Prepare uncond logits only when needed (for image CFG)
|
| 242 |
+
# Create combined_uncond_text and combined_uncond_img by replacing prefix with uncon_text/uncon_image
|
| 243 |
+
if (cfg_scale > 0.0 and uncon_text is not None) or (cfg_img > 0.0 and uncon_image is not None):
|
| 244 |
+
# clone base input
|
| 245 |
+
# IMPORTANT: uncon_text/uncon_image expected to be on the same device or will be moved
|
| 246 |
+
# If uncon_text / uncon_image is None, create copies to avoid errors
|
| 247 |
+
if uncon_text is None:
|
| 248 |
+
combined_uncond_text = combined_input_ids.clone()
|
| 249 |
+
else:
|
| 250 |
+
combined_uncond_text = combined_input_ids.clone()
|
| 251 |
+
prefix_len = uncon_text.shape[1]
|
| 252 |
+
combined_uncond_text[:, :prefix_len] = uncon_text.to(device)
|
| 253 |
+
|
| 254 |
+
if uncon_image is None:
|
| 255 |
+
combined_uncond_img = combined_input_ids.clone()
|
| 256 |
+
else:
|
| 257 |
+
combined_uncond_img = combined_input_ids.clone()
|
| 258 |
+
prefix_len_img = uncon_image.shape[1]
|
| 259 |
+
combined_uncond_img[:, :prefix_len_img] = uncon_image.to(device)
|
| 260 |
+
|
| 261 |
+
# Forward for unconds
|
| 262 |
+
with torch.no_grad():
|
| 263 |
+
uncond_text_logits_full = model(combined_uncond_text, infer=True, use_cache=False).logits
|
| 264 |
+
uncond_img_logits_full = model(combined_uncond_img, infer=True, use_cache=False).logits
|
| 265 |
+
|
| 266 |
+
# Extract VQ ranges for each image position
|
| 267 |
+
uncond_text_vq_list = []
|
| 268 |
+
uncond_img_vq_list = []
|
| 269 |
+
for pos in image_position_mapping:
|
| 270 |
+
uncond_text_vq_list.append(uncond_text_logits_full[:, pos:pos+1, text_vocab_size:text_vocab_size+codebook_size])
|
| 271 |
+
uncond_img_vq_list.append(uncond_img_logits_full[:, pos:pos+1, text_vocab_size:text_vocab_size+codebook_size])
|
| 272 |
+
|
| 273 |
+
uncond_text_vq_logits = torch.cat(uncond_text_vq_list, dim=1) # [B, num_vq_tokens, codebook_size]
|
| 274 |
+
uncond_img_vq_logits = torch.cat(uncond_img_vq_list, dim=1) # [B, num_vq_tokens, codebook_size]
|
| 275 |
+
else:
|
| 276 |
+
# no unconds provided or scales are zero -> set uncond logits to zeros so (cond - 0) works if used
|
| 277 |
+
uncond_text_vq_logits = torch.zeros_like(cond_vq_logits)
|
| 278 |
+
uncond_img_vq_logits = torch.zeros_like(cond_vq_logits)
|
| 279 |
+
|
| 280 |
+
# Compose guided image logits:
|
| 281 |
+
# image_logits = cond_vq + cfg_scale * (cond_vq - uncond_text_vq) + cfg_img * (cond_vq - uncond_img_vq)
|
| 282 |
+
if cfg_scale == 0.0 and cfg_img == 0.0:
|
| 283 |
+
image_logits = cond_vq_logits
|
| 284 |
+
else:
|
| 285 |
+
image_logits = cond_vq_logits
|
| 286 |
+
if cfg_scale != 0.0:
|
| 287 |
+
image_logits = image_logits + cfg_scale * (cond_vq_logits - uncond_text_vq_logits)
|
| 288 |
+
if cfg_img != 0.0:
|
| 289 |
+
image_logits = image_logits + cfg_img * (cond_vq_logits - uncond_img_vq_logits)
|
| 290 |
+
|
| 291 |
+
# Sample from image_logits
|
| 292 |
+
probs = F.softmax(image_logits, dim=-1) # [B, num_vq, codebook]
|
| 293 |
+
|
| 294 |
+
if temperature == 0:
|
| 295 |
+
sampled_ids = probs.argmax(dim=-1)
|
| 296 |
+
else:
|
| 297 |
+
# flatten batch*num_vq x vocab for multinomial
|
| 298 |
+
sampled = probs.reshape(-1, image_logits.size(-1))
|
| 299 |
+
if generator is not None:
|
| 300 |
+
sampled_ids = torch.multinomial(sampled, 1, generator=generator)[:, 0].view(*image_logits.shape[:-1])
|
| 301 |
+
else:
|
| 302 |
+
sampled_ids = torch.multinomial(sampled, 1)[:, 0].view(*image_logits.shape[:-1])
|
| 303 |
+
|
| 304 |
+
# Keep already-unmasked tokens unchanged
|
| 305 |
+
sampled_ids = torch.where(unknown_map, sampled_ids, vq_tokens_tensor)
|
| 306 |
+
|
| 307 |
+
# Clamp safety
|
| 308 |
+
sampled_ids = torch.clamp(sampled_ids, 0, codebook_size - 1)
|
| 309 |
+
|
| 310 |
+
# Confidence for sampled tokens
|
| 311 |
+
selected_probs = torch.gather(probs, -1, sampled_ids.long()[..., None]).squeeze(-1) # [B, num_vq]
|
| 312 |
+
|
| 313 |
+
# If token was previously unmasked, give it very high confidence so we don't remask it
|
| 314 |
+
high_val = torch.finfo(selected_probs.dtype).max
|
| 315 |
+
selected_probs = torch.where(unknown_map, selected_probs, high_val)
|
| 316 |
+
|
| 317 |
+
# Masking ratio and mask_len calculation
|
| 318 |
+
ratio = 1.0 * (step + 1) / text_steps
|
| 319 |
+
mask_ratio = noise_schedule(torch.tensor(ratio, device=device))
|
| 320 |
+
# compute how many tokens to keep masked (lowest confidences)
|
| 321 |
+
unknown_counts = unknown_map.sum(dim=-1, keepdim=True) # [B,1]
|
| 322 |
+
mask_len = (num_vq_tokens * mask_ratio).floor().unsqueeze(0).to(device)  # 0-dim tensor -> shape [1]
|
| 323 |
+
# clamp mask_len to [1, unknown_counts-1]
|
| 324 |
+
mask_len = torch.max(torch.tensor([1], device=device), torch.min(unknown_counts - 1, mask_len.to(device).long()))
|
| 325 |
+
# ensure shape [B,1]
|
| 326 |
+
if mask_len.ndim == 1:
|
| 327 |
+
mask_len = mask_len.unsqueeze(1)
|
| 328 |
+
|
| 329 |
+
# temperature decay for image sampling (optional)
|
| 330 |
+
img_temp = temperature * (1.0 - ratio)
|
| 331 |
+
|
| 332 |
+
# masking boolean: True where should remain masked
|
| 333 |
+
masking = mask_by_random_topk(mask_len, selected_probs, img_temp, generator=generator)
|
| 334 |
+
|
| 335 |
+
# final_vq_tokens: -1 means remain masked, else sampled id
|
| 336 |
+
final_vq_tokens = torch.where(masking, torch.tensor(-1, device=device), sampled_ids)
|
| 337 |
+
|
| 338 |
+
# Write back into combined_input_ids (convert vq id -> full vocab id by adding offset)
|
| 339 |
+
for idx, pos in enumerate(image_position_mapping):
|
| 340 |
+
v = final_vq_tokens[0, idx].item()
|
| 341 |
+
if v == -1:
|
| 342 |
+
combined_input_ids[0, pos] = MASK_TOKEN
|
| 343 |
+
else:
|
| 344 |
+
combined_input_ids[0, pos] = int(v + text_vocab_size)
|
| 345 |
+
|
| 346 |
+
# ===== Extract final results =====
|
| 347 |
+
# Extract text tokens
|
| 348 |
+
text_tokens = combined_input_ids[0, text_start:text_end].cpu().tolist()
|
| 349 |
+
text_tokens = [t for t in text_tokens if t != MASK_TOKEN]
|
| 350 |
+
generated_text = tokenizer.decode(text_tokens, skip_special_tokens=True) if tokenizer is not None else text_tokens
|
| 351 |
+
|
| 352 |
+
# Extract image VQ tokens
|
| 353 |
+
image_tokens = []
|
| 354 |
+
for pos in image_position_mapping:
|
| 355 |
+
token = combined_input_ids[0, pos].item()
|
| 356 |
+
if token != MASK_TOKEN:
|
| 357 |
+
vq_token = token - text_vocab_size
|
| 358 |
+
vq_token = max(0, min(vq_token, codebook_size - 1))
|
| 359 |
+
image_tokens.append(vq_token)
|
| 360 |
+
else:
|
| 361 |
+
# still masked -> sample randomly
|
| 362 |
+
image_tokens.append(int(torch.randint(0, codebook_size, (1,)).item()))
|
| 363 |
+
|
| 364 |
+
print(f"Interleaved generation complete.")
|
| 365 |
+
print(f" - Generated text: {len(text_tokens)} tokens")
|
| 366 |
+
print(f" - Generated image: {len(image_tokens)} VQ tokens (range [0, {codebook_size}))")
|
| 367 |
+
|
| 368 |
+
return image_tokens, generated_text
|
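The generator above composes two separate classifier-free-guidance terms for the image logits and shrinks the set of masked VQ tokens with a cosine schedule. The snippet below is a stand-alone sketch of those two ideas only; the tensor shapes, guidance scales, and step counts are illustrative placeholders, not values taken from the checkpoint or the app.
import math
import torch
# Dual-guidance composition used for the image logits in generate_ti2ti:
# guided = cond + cfg_scale * (cond - uncond_text) + cfg_img * (cond - uncond_img)
cond = torch.randn(1, 4, 8)          # [batch, num_vq_tokens, codebook_size] (toy sizes)
uncond_text = torch.randn(1, 4, 8)   # pass with the conditioning text replaced
uncond_img = torch.randn(1, 4, 8)    # pass with the conditioning image removed
cfg_scale, cfg_img = 2.5, 4.0
guided = cond + cfg_scale * (cond - uncond_text) + cfg_img * (cond - uncond_img)
# Cosine schedule: how many VQ tokens remain masked as generation progresses.
num_vq_tokens, total_steps = 1024, 64
for step in (0, 31, 63):
    ratio = (step + 1) / total_steps
    remain_masked = int(num_vq_tokens * math.cos(ratio * math.pi / 2))
    print(f"step {step:2d}: keep {remain_masked} tokens masked")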
inference.py
ADDED
|
@@ -0,0 +1,245 @@
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import time
|
| 8 |
+
import math
|
| 9 |
+
from PIL import Image
|
| 10 |
+
import torch
|
| 11 |
+
from transformers import AutoTokenizer
|
| 12 |
+
from model import LLaDAForMultiModalGeneration
|
| 13 |
+
from utils.generation_utils import setup_seed
|
| 14 |
+
from utils.image_utils import (
|
| 15 |
+
preprocess_image, decode_vq_to_image, calculate_vq_params,
|
| 16 |
+
generate_crop_size_list, var_center_crop, add_break_line, encode_img_with_breaks,
|
| 17 |
+
encode_img_with_paint
|
| 18 |
+
)
|
| 19 |
+
from generators.parallel_generator import generate_ti2ti
|
| 20 |
+
from utils.prompt_utils import generate_text_image_to_text_image_prompt
|
| 21 |
+
|
| 22 |
+
SPECIAL_TOKENS = {
|
| 23 |
+
"mask_token": 126336,
|
| 24 |
+
"newline_token": 126084,
|
| 25 |
+
"image_token_offset": 126356,
|
| 26 |
+
"answer_start": 126354,
|
| 27 |
+
"answer_end": 126355,
|
| 28 |
+
"boi": 126349,
|
| 29 |
+
"eoi": 126350,
|
| 30 |
+
"uncondition": 126351
|
| 31 |
+
}
|
| 32 |
+
SYSTEM_PROMPT = (
|
| 33 |
+
"Generate an image applying the following editing instruction based on the original image."
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def cosine_schedule(t):
|
| 38 |
+
return torch.cos(t * math.pi / 2)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def main():
|
| 42 |
+
parser = argparse.ArgumentParser(description="Text+Image to Text+Image inference (TI2TI)")
|
| 43 |
+
parser.add_argument("--checkpoint", type=str, required=True, help="Fine-tuned checkpoint path")
|
| 44 |
+
parser.add_argument("--prompt", type=str, required=True, help="Text prompt for editing")
|
| 45 |
+
parser.add_argument("--image_path", type=str, required=True, help="Input image path")
|
| 46 |
+
parser.add_argument("--height", type=int, default=512, help="Output image height")
|
| 47 |
+
parser.add_argument("--width", type=int, default=512, help="Output image width")
|
| 48 |
+
parser.add_argument("--timesteps", type=int, default=64, help="Number of diffusion timesteps")
|
| 49 |
+
parser.add_argument("--text_steps", type=int, default=256, help="Number of text generation steps")
|
| 50 |
+
parser.add_argument("--text_gen_length", type=int, default=256, help="Maximum text generation length")
|
| 51 |
+
parser.add_argument("--text_block_length", type=int, default=32, help="Text generation block length")
|
| 52 |
+
parser.add_argument("--cfg_scale", type=float, default=2.5, help="CFG scale for text")
|
| 53 |
+
parser.add_argument("--cfg_img", type=float, default=4.0, help="CFG scale for image")
|
| 54 |
+
parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature")
|
| 55 |
+
parser.add_argument("--text_temperature", type=float, default=0.7, help="Text generation temperature")
|
| 56 |
+
parser.add_argument("--seed", type=int, default=0, help="Random seed")
|
| 57 |
+
parser.add_argument("--vae_ckpt", type=str, required=True, help="VAE checkpoint path")
|
| 58 |
+
parser.add_argument("--output_dir", type=str, default="results_ti2ti", help="Output directory")
|
| 59 |
+
parser.add_argument("--remasking", type=str, default="low_confidence",
|
| 60 |
+
choices=["low_confidence", "random"],
|
| 61 |
+
help="Remasking strategy")
|
| 62 |
+
parser.add_argument("--painting_mode", type=str, default=None, help="If set, use painting-mode encoding")
|
| 63 |
+
parser.add_argument("--mask_h_ratio", type=float, default=0.5, help="mask height ratio for painting mode")
|
| 64 |
+
parser.add_argument("--mask_w_ratio", type=float, default=0.5, help="mask width ratio for painting mode")
|
| 65 |
+
parser.add_argument("--debug_tokens", action="store_true", help="Print token debug info to verify sequence layout")
|
| 66 |
+
args = parser.parse_args()
|
| 67 |
+
|
| 68 |
+
MASK = SPECIAL_TOKENS["mask_token"]
|
| 69 |
+
NEW_LINE = SPECIAL_TOKENS["newline_token"]
|
| 70 |
+
BOA = SPECIAL_TOKENS["answer_start"]
|
| 71 |
+
EOA = SPECIAL_TOKENS["answer_end"]
|
| 72 |
+
BOI = SPECIAL_TOKENS["boi"]
|
| 73 |
+
EOI = SPECIAL_TOKENS["eoi"]
|
| 74 |
+
|
| 75 |
+
if args.seed != 0:
|
| 76 |
+
setup_seed(args.seed)
|
| 77 |
+
|
| 78 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 79 |
+
|
| 80 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 81 |
+
print(f"Loading model from {args.checkpoint}...")
|
| 82 |
+
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, trust_remote_code=True)
|
| 83 |
+
model = LLaDAForMultiModalGeneration.from_pretrained(
|
| 84 |
+
args.checkpoint, torch_dtype=torch.bfloat16, device_map="auto",
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
config = model.config
|
| 88 |
+
text_vocab_size = getattr(config, 'text_vocab_size', 126356)
|
| 89 |
+
codebook_size = getattr(config, 'codebook_size', 8192)
|
| 90 |
+
|
| 91 |
+
print(f"Vocabulary config: text_vocab_size={text_vocab_size}, codebook_size={codebook_size}")
|
| 92 |
+
|
| 93 |
+
print(f"Loading VQ-VAE from {args.vae_ckpt}...")
|
| 94 |
+
from diffusers import VQModel
|
| 95 |
+
vqvae = VQModel.from_pretrained(args.vae_ckpt, subfolder="vqvae").to(device)
|
| 96 |
+
vae_scale = 2 ** (len(vqvae.config.block_out_channels) - 1)
|
| 97 |
+
|
| 98 |
+
prompt_text = args.prompt
|
| 99 |
+
input_image_path = args.image_path
|
| 100 |
+
|
| 101 |
+
print(f"\n{'='*80}")
|
| 102 |
+
print(f"TI2TI Generation")
|
| 103 |
+
print(f"{'='*80}")
|
| 104 |
+
print(f"Input image: {input_image_path}")
|
| 105 |
+
print(f"Prompt: {prompt_text}")
|
| 106 |
+
print(f"Output size: {args.height}x{args.width}")
|
| 107 |
+
print(f"{'='*80}\n")
|
| 108 |
+
|
| 109 |
+
input_prompt, uncon_text = generate_text_image_to_text_image_prompt(
|
| 110 |
+
prompt_text, SYSTEM_PROMPT
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
print("Conditioning prompt:\n", input_prompt)
|
| 114 |
+
if args.debug_tokens:
|
| 115 |
+
print("Unconditional text prompt (first 200 chars):", uncon_text[:200])
|
| 116 |
+
|
| 117 |
+
prompt_ids = tokenizer(input_prompt)["input_ids"]
|
| 118 |
+
uncon_text_ids = tokenizer(uncon_text)["input_ids"]
|
| 119 |
+
|
| 120 |
+
img = Image.open(input_image_path).convert("RGB")
|
| 121 |
+
crop_size_list = generate_crop_size_list((512 // 32) ** 2, 32)
|
| 122 |
+
img = var_center_crop(img, crop_size_list=crop_size_list)
|
| 123 |
+
|
| 124 |
+
input_image_width, input_image_height = img.size
|
| 125 |
+
|
| 126 |
+
print("Encoding input image for conditioning...")
|
| 127 |
+
input_img_token = encode_img_with_breaks(img, vqvae)
|
| 128 |
+
|
| 129 |
+
con_input_list = prompt_ids[:-1] + input_img_token + prompt_ids[-1:]
|
| 130 |
+
uncon_input_text = uncon_text_ids[:-1] + input_img_token + uncon_text_ids[-1:]
|
| 131 |
+
uncon_input_image = prompt_ids
|
| 132 |
+
|
| 133 |
+
output_image_height = args.height
|
| 134 |
+
output_image_width = args.width
|
| 135 |
+
seq_len, newline_every, token_grid_height, token_grid_width = calculate_vq_params(
|
| 136 |
+
output_image_height, output_image_width, vae_scale
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
text_mask_tokens = [MASK] * args.text_gen_length
|
| 140 |
+
|
| 141 |
+
if args.painting_mode:
|
| 142 |
+
img_mask_token, img_vis = encode_img_with_paint(
|
| 143 |
+
img, vqvae=vqvae, mask_h_ratio=args.mask_h_ratio, mask_w_ratio=args.mask_w_ratio, mask_mode=args.painting_mode
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
img_mask_token = add_break_line([MASK] * seq_len, token_grid_height, token_grid_width, new_number=NEW_LINE)
|
| 147 |
+
|
| 148 |
+
end_token_ids = tokenizer("</answer>", add_special_tokens=False).input_ids
|
| 149 |
+
|
| 150 |
+
pred_token = [BOA] + [BOI] + img_mask_token + [EOI] + text_mask_tokens + end_token_ids
|
| 151 |
+
|
| 152 |
+
code_start = len(con_input_list)
|
| 153 |
+
image_start = len(con_input_list) + 2
|
| 154 |
+
image_end = image_start + len(img_mask_token)
|
| 155 |
+
text_start = image_end + 1
|
| 156 |
+
text_end = text_start + args.text_gen_length
|
| 157 |
+
|
| 158 |
+
full_input_ids = con_input_list + pred_token
|
| 159 |
+
con_input = torch.tensor(full_input_ids, device=device).unsqueeze(0)
|
| 160 |
+
uncon_input_text = torch.tensor(uncon_input_text, device=device).unsqueeze(0)
|
| 161 |
+
uncon_input_image = torch.tensor(uncon_input_image, device=device).unsqueeze(0)
|
| 162 |
+
start_time = time.time()
|
| 163 |
+
|
| 164 |
+
if args.seed != 0:
|
| 165 |
+
generator = torch.Generator(device=device).manual_seed(args.seed)
|
| 166 |
+
else:
|
| 167 |
+
generator = None
|
| 168 |
+
|
| 169 |
+
output_tokens, generated_text = generate_ti2ti(
|
| 170 |
+
model=model,
|
| 171 |
+
input_ids=con_input,
|
| 172 |
+
text_start=text_start,
|
| 173 |
+
text_end=text_end,
|
| 174 |
+
image_start=image_start,
|
| 175 |
+
seq_len=seq_len,
|
| 176 |
+
newline_every=newline_every,
|
| 177 |
+
text_steps=args.text_steps,
|
| 178 |
+
text_gen_length=args.text_gen_length,
|
| 179 |
+
text_block_length=args.text_block_length,
|
| 180 |
+
timesteps=args.timesteps,
|
| 181 |
+
temperature=args.temperature,
|
| 182 |
+
text_temperature=args.text_temperature,
|
| 183 |
+
cfg_scale=args.cfg_scale,
|
| 184 |
+
cfg_img=args.cfg_img,
|
| 185 |
+
uncon_text=uncon_input_text,
|
| 186 |
+
uncon_image=uncon_input_image,
|
| 187 |
+
tokenizer=tokenizer,
|
| 188 |
+
remasking=args.remasking,
|
| 189 |
+
noise_schedule=cosine_schedule,
|
| 190 |
+
generator=generator,
|
| 191 |
+
text_vocab_size=text_vocab_size,
|
| 192 |
+
codebook_size=codebook_size,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
end_time = time.time()
|
| 196 |
+
elapsed_time = end_time - start_time
|
| 197 |
+
|
| 198 |
+
print(f"\n{'='*80}")
|
| 199 |
+
print(f"Generated thinking/text output:")
|
| 200 |
+
print(f"{'='*80}")
|
| 201 |
+
print(generated_text)
|
| 202 |
+
print(f"{'='*80}\n")
|
| 203 |
+
|
| 204 |
+
print(f"Converting {len(output_tokens)} VQ tokens to tensor...")
|
| 205 |
+
output_tokens_tensor = torch.tensor(output_tokens, dtype=torch.long, device=device).unsqueeze(0)
|
| 206 |
+
|
| 207 |
+
print(f"VQ tokens range: [{min(output_tokens)}, {max(output_tokens)}]")
|
| 208 |
+
|
| 209 |
+
words = (prompt_text or "").split()
|
| 210 |
+
filename_words = words[:10] if len(words) > 10 else words
|
| 211 |
+
filename = "_".join(filename_words)
|
| 212 |
+
filename = "".join(c for c in filename if c.isalnum() or c in ('_', '-'))
|
| 213 |
+
filename = f"{filename}_{output_image_height}x{output_image_width}_t{args.timesteps}_cfg{args.cfg_scale}_ti2ti.png"
|
| 214 |
+
|
| 215 |
+
save_path = os.path.join(args.output_dir, filename)
|
| 216 |
+
|
| 217 |
+
print("Decoding image...")
|
| 218 |
+
out_img = decode_vq_to_image(
|
| 219 |
+
output_tokens_tensor,
|
| 220 |
+
save_path,
|
| 221 |
+
vae_ckpt=args.vae_ckpt,
|
| 222 |
+
image_height=output_image_height,
|
| 223 |
+
image_width=output_image_width,
|
| 224 |
+
vqvae=vqvae
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
w1, h1 = img.size
|
| 228 |
+
w2, h2 = out_img.size
|
| 229 |
+
canvas = Image.new("RGB", (w1 + w2, max(h1, h2)), "white")
|
| 230 |
+
canvas.paste(img, (0, 0))
|
| 231 |
+
canvas.paste(out_img, (w1, 0))
|
| 232 |
+
concat_path = save_path.replace(".png", "_concat.png")
|
| 233 |
+
canvas.save(concat_path)
|
| 234 |
+
|
| 235 |
+
text_path = save_path.replace(".png", "_thinking.txt")
|
| 236 |
+
with open(text_path, "w", encoding="utf-8") as f:
|
| 237 |
+
f.write(f"{generated_text}\n")
|
| 238 |
+
|
| 239 |
+
print(f"\n[✓] Image saved to: {concat_path}")
|
| 240 |
+
print(f"[✓] Text saved to: {text_path}")
|
| 241 |
+
print(f"[✓] Total time: {elapsed_time:.2f}s")
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
if __name__ == '__main__':
|
| 245 |
+
main()
|
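A detail of main() that is easy to miss is how the prediction-region offsets are derived: the image tokens begin two positions after the conditioning prefix (skipping the answer-start and begin-of-image tokens), and the text region begins one position after the end-of-image token. The arithmetic below reproduces that layout with made-up sizes; the real values come from the tokenizer and calculate_vq_params(), so the numbers here are illustrative only.
prefix_len = 16                      # stands in for len(con_input_list)
seq_len, newline_every = 1024, 32    # VQ grid size and newline spacing
text_gen_length = 256
img_region_len = seq_len + seq_len // newline_every   # VQ tokens plus inserted newlines
image_start = prefix_len + 2         # skip [BOA] and [BOI]
image_end = image_start + img_region_len
text_start = image_end + 1           # skip [EOI]
text_end = text_start + text_gen_length
print(image_start, image_end, text_start, text_end)   # 18 1074 1075 1331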
model/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
from .modeling_xllmx_dimoo import LLaDAForMultiModalGeneration
|
model/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (271 Bytes).
|
model/__pycache__/configuration_llada.cpython-311.pyc
ADDED
|
Binary file (9.27 kB).
|
model/__pycache__/modeling_llada.cpython-311.pyc
ADDED
|
Binary file (78.2 kB).
|
model/__pycache__/modeling_xllmx_dimoo.cpython-311.pyc
ADDED
|
Binary file (11.8 kB).
|
model/configuration_llada.py
ADDED
|
@@ -0,0 +1,463 @@
|
| 1 |
+
"""
|
| 2 |
+
LLaDA configuration
|
| 3 |
+
"""
|
| 4 |
+
from transformers import AutoConfig, PretrainedConfig
|
| 5 |
+
|
| 6 |
+
from enum import Enum
|
| 7 |
+
from os import PathLike
|
| 8 |
+
from typing import Union
|
| 9 |
+
from dataclasses import asdict, dataclass, field
|
| 10 |
+
from glob import glob
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import (
|
| 13 |
+
Any,
|
| 14 |
+
Dict,
|
| 15 |
+
Iterable,
|
| 16 |
+
List,
|
| 17 |
+
Optional,
|
| 18 |
+
Tuple,
|
| 19 |
+
Type,
|
| 20 |
+
TypeVar,
|
| 21 |
+
Union,
|
| 22 |
+
cast,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
__all__ = [
|
| 27 |
+
"ActivationType",
|
| 28 |
+
"ActivationCheckpointingStrategy",
|
| 29 |
+
"BlockType",
|
| 30 |
+
"LayerNormType",
|
| 31 |
+
"InitFnType",
|
| 32 |
+
"ModelConfig",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
PathOrStr = Union[str, PathLike]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class StrEnum(str, Enum):
|
| 39 |
+
"""
|
| 40 |
+
This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
|
| 41 |
+
We include this here for compatibility with older versions of Python.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
def __str__(self) -> str:
|
| 45 |
+
return self.value
|
| 46 |
+
|
| 47 |
+
def __repr__(self) -> str:
|
| 48 |
+
return f"'{str(self)}'"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class LayerNormType(StrEnum):
|
| 52 |
+
default = "default"
|
| 53 |
+
"""
|
| 54 |
+
The default LayerNorm implementation, equivalent to PyTorch's built-in version.
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
low_precision = "low_precision"
|
| 58 |
+
"""
|
| 59 |
+
A low-precision version of the default LayerNorm.
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
rms = "rms"
|
| 63 |
+
"""
|
| 64 |
+
An RMSNorm implementation. When using ``torch.compile`` this is
|
| 65 |
+
probably the fastest implementation.
|
| 66 |
+
"""
|
| 67 |
+
|
| 68 |
+
gemma_rms = "gemma_rms"
|
| 69 |
+
"""
|
| 70 |
+
An RMSNorm implementation from Gemma. When using ``torch.compile`` this is
|
| 71 |
+
probably the fastest implementation.
|
| 72 |
+
"""
|
| 73 |
+
|
| 74 |
+
amd_compatible = "amd_compatible"
|
| 75 |
+
"""
|
| 76 |
+
LayerNorm implemented manually to work around an issue with ROCm.
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class ActivationType(StrEnum):
|
| 81 |
+
gelu = "gelu"
|
| 82 |
+
relu = "relu"
|
| 83 |
+
silu = "silu"
|
| 84 |
+
swiglu = "swiglu"
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class BlockType(StrEnum):
|
| 88 |
+
sequential = "sequential"
|
| 89 |
+
parallel = "parallel"
|
| 90 |
+
|
| 91 |
+
llama = "llama"
|
| 92 |
+
"""
|
| 93 |
+
A block similar to the sequential block with slightly different
|
| 94 |
+
implementations of operations like attention to imitate the behavior of Llama.
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class InitFnType(StrEnum):
|
| 99 |
+
mitchell = "mitchell"
|
| 100 |
+
"""
|
| 101 |
+
The strategy suggested to us by Mitchell Wortsman from UW.
|
| 102 |
+
This uses a truncated normal distribution with an adaptive standard deviation that depends
|
| 103 |
+
on the size of the weights as well as the depth of the layer.
|
| 104 |
+
"""
|
| 105 |
+
|
| 106 |
+
normal = "normal"
|
| 107 |
+
"""
|
| 108 |
+
All weights are initialized from the same normal distribution.
|
| 109 |
+
"""
|
| 110 |
+
|
| 111 |
+
kaiming_normal = "kaiming_normal"
|
| 112 |
+
"""
|
| 113 |
+
All weights are initialized with the Kaiming method from a normal distribution.
|
| 114 |
+
Note this currently won't work with FSDP.
|
| 115 |
+
"""
|
| 116 |
+
|
| 117 |
+
fan_in = "fan_in"
|
| 118 |
+
"""
|
| 119 |
+
"Fan-in variance scaling", i.e. normal with a standard deviation of ``1/sqrt(d_in)`` where ``d_in``
|
| 120 |
+
is the input dimensionality of the kernel.
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
full_megatron = "full_megatron"
|
| 124 |
+
"""
|
| 125 |
+
This is what metaseq calls "full megatron init". It is the init used for Llama 2.
|
| 126 |
+
"""
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@dataclass
|
| 130 |
+
class ModelConfig():
|
| 131 |
+
"""
|
| 132 |
+
LLaDA (model) configuration.
|
| 133 |
+
"""
|
| 134 |
+
|
| 135 |
+
# Note that the defaults for these attributes are equivalent to the base GPT2 model.
|
| 136 |
+
|
| 137 |
+
d_model: int = 768
|
| 138 |
+
"""
|
| 139 |
+
The hidden size of the model.
|
| 140 |
+
"""
|
| 141 |
+
|
| 142 |
+
n_heads: int = 12
|
| 143 |
+
"""
|
| 144 |
+
The number of self-attention heads.
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
n_kv_heads: Optional[int] = None
|
| 148 |
+
"""
|
| 149 |
+
The number of heads to use for keys and values. Defaults to `n_heads`.
|
| 150 |
+
Set this to ``None`` or ``n_heads`` for normal multi-head attention.
|
| 151 |
+
Set this to 1 for multi-query attention.
|
| 152 |
+
Set it to some in-between value for Llama2-style grouped query attention.
|
| 153 |
+
"""
|
| 154 |
+
|
| 155 |
+
n_layers: int = 12
|
| 156 |
+
"""
|
| 157 |
+
The number of layers/blocks.
|
| 158 |
+
"""
|
| 159 |
+
|
| 160 |
+
mlp_ratio: int = 4
|
| 161 |
+
"""
|
| 162 |
+
The ratio of the inner MLP dimensionality to ``d_model``.
|
| 163 |
+
This is only used when ``mlp_hidden_size`` is not set.
|
| 164 |
+
"""
|
| 165 |
+
|
| 166 |
+
mlp_hidden_size: Optional[int] = None
|
| 167 |
+
"""
|
| 168 |
+
Set the exact hidden size for the MLP. Otherwise the inner MLP hidden size will be set to `mlp_ratio * d_model`.
|
| 169 |
+
"""
|
| 170 |
+
|
| 171 |
+
activation_type: ActivationType = ActivationType.swiglu
|
| 172 |
+
"""
|
| 173 |
+
The activation function to use within the MLP layers.
|
| 174 |
+
"""
|
| 175 |
+
|
| 176 |
+
block_type: BlockType = BlockType.sequential
|
| 177 |
+
"""
|
| 178 |
+
The transformer block implementation.
|
| 179 |
+
"""
|
| 180 |
+
|
| 181 |
+
block_group_size: int = 1
|
| 182 |
+
"""
|
| 183 |
+
The number of blocks to group together into a single parent block.
|
| 184 |
+
This has no effect on the number of parameters in the model and is only used to wrap groups
|
| 185 |
+
of blocks together with a single FSDP wrapper during training.
|
| 186 |
+
"""
|
| 187 |
+
|
| 188 |
+
alibi: bool = False
|
| 189 |
+
"""
|
| 190 |
+
If ``True``, use ALiBi embeddings. Mutually exclusive with ``rope``.
|
| 191 |
+
"""
|
| 192 |
+
|
| 193 |
+
alibi_bias_max: float = 8.0
|
| 194 |
+
"""
|
| 195 |
+
Maximum absolute value of ALiBi bias.
|
| 196 |
+
"""
|
| 197 |
+
|
| 198 |
+
rope: bool = False
|
| 199 |
+
"""
|
| 200 |
+
Use rotary positional embeddings (RoPE). Mutually exclusive with ``alibi``.
|
| 201 |
+
"""
|
| 202 |
+
|
| 203 |
+
rope_full_precision: bool = True
|
| 204 |
+
"""
|
| 205 |
+
If ``True``, apply RoPE embeddings at full precision regardless of the input type. Otherwise,
|
| 206 |
+
apply RoPE at the precision of the input.
|
| 207 |
+
"""
|
| 208 |
+
|
| 209 |
+
flash_attention: bool = False
|
| 210 |
+
"""
|
| 211 |
+
If ``True``, use ``FlashAttention``.
|
| 212 |
+
"""
|
| 213 |
+
|
| 214 |
+
attention_dropout: float = 0.1
|
| 215 |
+
"""
|
| 216 |
+
The dropout probability within the attention modules.
|
| 217 |
+
"""
|
| 218 |
+
|
| 219 |
+
multi_query_attention: Optional[bool] = None
|
| 220 |
+
"""
|
| 221 |
+
Use the Multi-Query formulation of attention used in PaLM. This reduces the number of parameters
|
| 222 |
+
and is more efficient during inference.
|
| 223 |
+
"""
|
| 224 |
+
|
| 225 |
+
attention_layer_norm: bool = False
|
| 226 |
+
"""
|
| 227 |
+
Apply layer norm to the keys and queries within the attention mechanism.
|
| 228 |
+
This can help stabilize training.
|
| 229 |
+
"""
|
| 230 |
+
|
| 231 |
+
residual_dropout: float = 0.1
|
| 232 |
+
"""
|
| 233 |
+
The dropout probability for the MLP and attention output within each block.
|
| 234 |
+
"""
|
| 235 |
+
|
| 236 |
+
embedding_dropout: float = 0.1
|
| 237 |
+
"""
|
| 238 |
+
The dropout probability for embeddings.
|
| 239 |
+
"""
|
| 240 |
+
|
| 241 |
+
input_emb_norm: bool = False
|
| 242 |
+
"""
|
| 243 |
+
An input hidden_states norm implementation from Gemma.
|
| 244 |
+
"""
|
| 245 |
+
|
| 246 |
+
layer_norm_type: LayerNormType = LayerNormType.default
|
| 247 |
+
"""
|
| 248 |
+
The layernorm implementation to use.
|
| 249 |
+
"""
|
| 250 |
+
|
| 251 |
+
layer_norm_with_affine: bool = True
|
| 252 |
+
"""
|
| 253 |
+
Whether to include bias and weight parameters for the layer norms.
|
| 254 |
+
This only affects layer norms that are immediately followed by a linear layer in the forward pass,
|
| 255 |
+
so everything except QK-norms. To turn off affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
|
| 256 |
+
to ``False``.
|
| 257 |
+
"""
|
| 258 |
+
|
| 259 |
+
rms_norm_eps: float = 1e-05
|
| 260 |
+
"""
|
| 261 |
+
The rms layernorm eps param.
|
| 262 |
+
"""
|
| 263 |
+
|
| 264 |
+
attention_layer_norm_with_affine: bool = True
|
| 265 |
+
"""
|
| 266 |
+
Toggle affine transform for the QK norms.
|
| 267 |
+
"""
|
| 268 |
+
|
| 269 |
+
max_sequence_length: int = 1024
|
| 270 |
+
"""
|
| 271 |
+
The maximum input sequence length supported by the model.
|
| 272 |
+
"""
|
| 273 |
+
|
| 274 |
+
rope_theta: float = 10000.0
|
| 275 |
+
"""
|
| 276 |
+
The rope base param.
|
| 277 |
+
"""
|
| 278 |
+
|
| 279 |
+
include_qkv_bias: Optional[bool] = False
|
| 280 |
+
"""
|
| 281 |
+
Whether or not to include bias parameters in qkv linear layers.
|
| 282 |
+
"""
|
| 283 |
+
|
| 284 |
+
include_bias: bool = False
|
| 285 |
+
"""
|
| 286 |
+
Whether or not to include bias parameters in linear layers.
|
| 287 |
+
In PaLM, they got rid of all bias terms because they found that large
|
| 288 |
+
models tend to have near 0 bias terms anyway.
|
| 289 |
+
"""
|
| 290 |
+
|
| 291 |
+
bias_for_layer_norm: Optional[bool] = None
|
| 292 |
+
"""
|
| 293 |
+
Whether or not to include bias parameters in layer norm.
|
| 294 |
+
This is separate from the include_bias parameter, because of a ROCm crash when biases are disabled in
|
| 295 |
+
layer norm.
|
| 296 |
+
When this is None (the default), it inherits the setting from include_bias.
|
| 297 |
+
"""
|
| 298 |
+
|
| 299 |
+
scale_logits: bool = False
|
| 300 |
+
"""
|
| 301 |
+
If ``True``, scale the output logits by ``1 / sqrt(d_model)``.
|
| 302 |
+
"""
|
| 303 |
+
|
| 304 |
+
vocab_size: int = 50257
|
| 305 |
+
"""
|
| 306 |
+
Vocabulary size of the model.
|
| 307 |
+
"""
|
| 308 |
+
|
| 309 |
+
embedding_size: Optional[int] = 50304
|
| 310 |
+
"""
|
| 311 |
+
The number of embeddings, i.e. the number of tokens. If set to ``None`` it will default
|
| 312 |
+
to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, setting this to the
|
| 313 |
+
next multiple of 128 that's greater than ``vocab_size`` can improve throughput
|
| 314 |
+
substantially.
|
| 315 |
+
"""
|
| 316 |
+
|
| 317 |
+
weight_tying: bool = True
|
| 318 |
+
"""
|
| 319 |
+
Whether to tie output linear weights to the input embedding.
|
| 320 |
+
"""
|
| 321 |
+
|
| 322 |
+
eos_token_id: int = 50256
|
| 323 |
+
"""
|
| 324 |
+
The ID of the end-of-sentence special token.
|
| 325 |
+
"""
|
| 326 |
+
|
| 327 |
+
pad_token_id: int = 50256
|
| 328 |
+
"""
|
| 329 |
+
The ID of the token to use for padding. Defaults to the ID of the EOS token.
|
| 330 |
+
"""
|
| 331 |
+
|
| 332 |
+
mask_token_id: Optional[int] = 50256
|
| 333 |
+
"""
|
| 334 |
+
The ID of the token to use for mask token. Defaults to the ID of the EOS token.
|
| 335 |
+
"""
|
| 336 |
+
|
| 337 |
+
init_device: Optional[str] = None
|
| 338 |
+
"""
|
| 339 |
+
The torch device to use when initializing the model parameters, e.g. "cpu", "cuda:0", "meta".
|
| 340 |
+
"""
|
| 341 |
+
|
| 342 |
+
init_fn: InitFnType = InitFnType.normal
|
| 343 |
+
"""
|
| 344 |
+
The weight initialization strategy.
|
| 345 |
+
"""
|
| 346 |
+
|
| 347 |
+
init_std: float = 0.02
|
| 348 |
+
"""
|
| 349 |
+
The standard deviation to use when initializing weights with a "fixed distribution" ``init_fn``, such
|
| 350 |
+
as "normal".
|
| 351 |
+
"""
|
| 352 |
+
|
| 353 |
+
init_cutoff_factor: Optional[float] = None
|
| 354 |
+
"""
|
| 355 |
+
A positive factor used to scale the cutoff values when initializing weights with a "fixed distribution" ``init_fn``, such
|
| 356 |
+
as "normal". Setting this to None means values are not cutoff.
|
| 357 |
+
"""
|
| 358 |
+
|
| 359 |
+
precision: Optional[str] = None
|
| 360 |
+
"""
|
| 361 |
+
Precision used to train/evaluate with. You shouldn't set this directly.
|
| 362 |
+
See :data:`TrainConfig.precision` instead.
|
| 363 |
+
"""
|
| 364 |
+
|
| 365 |
+
@property
|
| 366 |
+
def effective_n_kv_heads(self) -> int:
|
| 367 |
+
if self.n_kv_heads is None:
|
| 368 |
+
if self.multi_query_attention is True:
|
| 369 |
+
return 1
|
| 370 |
+
else:
|
| 371 |
+
return self.n_heads
|
| 372 |
+
else:
|
| 373 |
+
if self.multi_query_attention is None:
|
| 374 |
+
return self.n_kv_heads
|
| 375 |
+
if self.multi_query_attention:
|
| 376 |
+
n_kv_heads_should_be = 1
|
| 377 |
+
else:
|
| 378 |
+
n_kv_heads_should_be = self.n_heads
|
| 379 |
+
if self.n_kv_heads == n_kv_heads_should_be:
|
| 380 |
+
return n_kv_heads_should_be
|
| 381 |
+
else:
|
| 382 |
+
raise Exception(
|
| 383 |
+
"You can't set `multi_query_attention` and `n_kv_heads` at the same time."
|
| 384 |
+
)
|
| 385 |
+
|
| 386 |
+
class ActivationCheckpointingStrategy(StrEnum):
|
| 387 |
+
whole_layer = "whole_layer"
|
| 388 |
+
"""
|
| 389 |
+
Checkpoint every transformer layer.
|
| 390 |
+
"""
|
| 391 |
+
|
| 392 |
+
one_in_two = "one_in_two"
|
| 393 |
+
"""
|
| 394 |
+
Checkpoint one in two transformer layers.
|
| 395 |
+
"""
|
| 396 |
+
|
| 397 |
+
one_in_three = "one_in_three"
|
| 398 |
+
"""
|
| 399 |
+
Checkpoint one in three transformer layers.
|
| 400 |
+
"""
|
| 401 |
+
|
| 402 |
+
one_in_four = "one_in_four"
|
| 403 |
+
"""
|
| 404 |
+
Checkpoint one in four transformer layers.
|
| 405 |
+
"""
|
| 406 |
+
|
| 407 |
+
two_in_three = "two_in_three"
|
| 408 |
+
"""
|
| 409 |
+
Checkpoint two out of every three transformer layers.
|
| 410 |
+
"""
|
| 411 |
+
|
| 412 |
+
three_in_four = "three_in_four"
|
| 413 |
+
"""
|
| 414 |
+
Checkpoint three out of every four transformer layers.
|
| 415 |
+
"""
|
| 416 |
+
|
| 417 |
+
four_in_five = "four_in_five"
|
| 418 |
+
"""
|
| 419 |
+
Checkpoint four out of every five transformer layers.
|
| 420 |
+
"""
|
| 421 |
+
|
| 422 |
+
nine_in_ten = "nine_in_ten"
|
| 423 |
+
"""
|
| 424 |
+
Checkpoint nine out of every ten transformer layers.
|
| 425 |
+
"""
|
| 426 |
+
|
| 427 |
+
fine_grained = "fine_grained"
|
| 428 |
+
"""
|
| 429 |
+
Focus checkpointing on where it is cheap to recompute and saves most memory.
|
| 430 |
+
"""
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
class LLaDAConfig(PretrainedConfig):
|
| 434 |
+
model_type = "llada"
|
| 435 |
+
keys_to_ignore_at_inference = ["past_key_values"] # TODO: confirm
|
| 436 |
+
|
| 437 |
+
def __init__(self, use_cache: bool = False, **kwargs):
|
| 438 |
+
model_config = ModelConfig()
|
| 439 |
+
all_kwargs = model_config.__dict__
|
| 440 |
+
all_kwargs.update(kwargs)
|
| 441 |
+
all_kwargs.update({"use_cache": use_cache})
|
| 442 |
+
all_kwargs.update(
|
| 443 |
+
{
|
| 444 |
+
"architectures": all_kwargs.get("architectures", ["LLaDAModelLM"])
|
| 445 |
+
}
|
| 446 |
+
)
|
| 447 |
+
super().__init__(**all_kwargs)
|
| 448 |
+
|
| 449 |
+
@property
|
| 450 |
+
def num_attention_heads(self):
|
| 451 |
+
return self.n_heads
|
| 452 |
+
|
| 453 |
+
@property
|
| 454 |
+
def num_hidden_layers(self):
|
| 455 |
+
return self.n_layers
|
| 456 |
+
|
| 457 |
+
@property
|
| 458 |
+
def hidden_size(self):
|
| 459 |
+
return self.d_model
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
# Register the config class so that it is available for transformer pipelines, auto-loading etc.
|
| 463 |
+
AutoConfig.register("llada", LLaDAConfig)
|
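For reference, the config can be constructed directly from keyword overrides of the ModelConfig defaults documented above, and it exposes the usual HuggingFace-style aliases as properties. The field values below are placeholders for illustration, not the settings of any released checkpoint.
from model.configuration_llada import LLaDAConfig
cfg = LLaDAConfig(d_model=4096, n_heads=32, n_layers=32, vocab_size=126464)
print(cfg.hidden_size)            # 4096, alias for d_model
print(cfg.num_attention_heads)    # 32, alias for n_heads
print(cfg.num_hidden_layers)      # 32, alias for n_layers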
model/modeling_llada.py
ADDED
|
@@ -0,0 +1,1567 @@
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import math
|
| 5 |
+
import sys
|
| 6 |
+
from abc import abstractmethod
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
from functools import partial
|
| 9 |
+
from typing import (
|
| 10 |
+
Callable,
|
| 11 |
+
Dict,
|
| 12 |
+
Iterable,
|
| 13 |
+
List,
|
| 14 |
+
NamedTuple,
|
| 15 |
+
Optional,
|
| 16 |
+
Sequence,
|
| 17 |
+
Set,
|
| 18 |
+
Tuple,
|
| 19 |
+
cast,
|
| 20 |
+
)
|
| 21 |
+
from dataclasses import fields
|
| 22 |
+
from typing import List, Optional, Tuple, Union
|
| 23 |
+
|
| 24 |
+
import torch
|
| 25 |
+
import torch.backends.cuda
|
| 26 |
+
import torch.nn as nn
|
| 27 |
+
import torch.nn.functional as F
|
| 28 |
+
from torch import einsum
|
| 29 |
+
from transformers import PreTrainedModel
|
| 30 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 31 |
+
from transformers.models.auto import AutoModel
|
| 32 |
+
from transformers.cache_utils import Cache
|
| 33 |
+
|
| 34 |
+
from .configuration_llada import (
|
| 35 |
+
LLaDAConfig,
|
| 36 |
+
StrEnum,
|
| 37 |
+
InitFnType,
|
| 38 |
+
ActivationType,
|
| 39 |
+
BlockType,
|
| 40 |
+
LayerNormType,
|
| 41 |
+
ModelConfig,
|
| 42 |
+
ActivationCheckpointingStrategy,
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
if sys.version_info.minor > 8:
|
| 46 |
+
from collections.abc import MutableMapping
|
| 47 |
+
elif sys.version_info.minor == 8:
|
| 48 |
+
from typing import MutableMapping
|
| 49 |
+
else:
|
| 50 |
+
raise SystemExit("This script supports Python 3.8 or higher")
|
| 51 |
+
|
| 52 |
+
__all__ = [
|
| 53 |
+
"LayerNormBase",
|
| 54 |
+
"LayerNorm",
|
| 55 |
+
"RMSLayerNorm",
|
| 56 |
+
"GemmaRMSLayerNorm",
|
| 57 |
+
"RotaryEmbedding",
|
| 58 |
+
"Activation",
|
| 59 |
+
"GELU",
|
| 60 |
+
"ReLU",
|
| 61 |
+
"SwiGLU",
|
| 62 |
+
"LLaDABlock",
|
| 63 |
+
"LLaDASequentialBlock",
|
| 64 |
+
"LLaDAModel",
|
| 65 |
+
"LLaDAOutput",
|
| 66 |
+
"LLaDAGenerateOutput",
|
| 67 |
+
]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
log = logging.getLogger(__name__)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class ModuleType(StrEnum):
|
| 74 |
+
in_module = "in"
|
| 75 |
+
out_module = "out"
|
| 76 |
+
emb = "emb"
|
| 77 |
+
final_out = "final_out"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def init_weights(
|
| 81 |
+
config: ModelConfig,
|
| 82 |
+
module: Union[nn.Linear, nn.Embedding],
|
| 83 |
+
d: Optional[int] = None,
|
| 84 |
+
layer_id: Optional[int] = None,
|
| 85 |
+
std_factor: float = 1.0,
|
| 86 |
+
type_of_module: Optional[ModuleType] = None,
|
| 87 |
+
) -> None:
|
| 88 |
+
"""
|
| 89 |
+
Initialize weights of a linear or embedding module.
|
| 90 |
+
|
| 91 |
+
:param config: The model config.
|
| 92 |
+
:param module: The linear or embedding submodule to initialize.
|
| 93 |
+
:param d: The effective input dimensionality of the weights. This could be smaller than the actual dimensions
|
| 94 |
+
for fused layers.
|
| 95 |
+
:param layer_id: When set, the standard deviation for the "mitchell" method will be adjusted by
|
| 96 |
+
``1 / sqrt(2 * (layer_id + 1))``.
|
| 97 |
+
"""
|
| 98 |
+
d = d if d is not None else config.d_model
|
| 99 |
+
if config.init_fn == InitFnType.normal:
|
| 100 |
+
std = config.init_std * std_factor
|
| 101 |
+
if config.init_cutoff_factor is not None:
|
| 102 |
+
cutoff_value = config.init_cutoff_factor * std
|
| 103 |
+
nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
|
| 104 |
+
else:
|
| 105 |
+
nn.init.normal_(module.weight, mean=0.0, std=std)
|
| 106 |
+
elif config.init_fn == InitFnType.mitchell:
|
| 107 |
+
std = std_factor / math.sqrt(d)
|
| 108 |
+
if layer_id is not None:
|
| 109 |
+
std = std / math.sqrt(2 * (layer_id + 1))
|
| 110 |
+
nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-3 * std, b=3 * std)
|
| 111 |
+
elif config.init_fn == InitFnType.kaiming_normal:
|
| 112 |
+
nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
|
| 113 |
+
elif config.init_fn == InitFnType.fan_in:
|
| 114 |
+
std = std_factor / math.sqrt(d)
|
| 115 |
+
nn.init.normal_(module.weight, mean=0.0, std=std)
|
| 116 |
+
elif config.init_fn == InitFnType.full_megatron:
|
| 117 |
+
if type_of_module is None:
|
| 118 |
+
raise RuntimeError(f"When using the {InitFnType.full_megatron} init, every module must have a type.")
|
| 119 |
+
|
| 120 |
+
cutoff_factor = config.init_cutoff_factor
|
| 121 |
+
if cutoff_factor is None:
|
| 122 |
+
cutoff_factor = 3
|
| 123 |
+
|
| 124 |
+
if type_of_module == ModuleType.in_module:
|
| 125 |
+
# for att_proj (same as QKV), ff_proj
|
| 126 |
+
std = config.init_std
|
| 127 |
+
elif type_of_module == ModuleType.out_module:
|
| 128 |
+
# for attn_out, ff_out
|
| 129 |
+
std = config.init_std / math.sqrt(2.0 * config.n_layers)
|
| 130 |
+
elif type_of_module == ModuleType.emb:
|
| 131 |
+
# positional embeddings (wpe)
|
| 132 |
+
# token embeddings (wte)
|
| 133 |
+
std = config.init_std
|
| 134 |
+
elif type_of_module == ModuleType.final_out:
|
| 135 |
+
# final output (ff_out)
|
| 136 |
+
std = config.d_model**-0.5
|
| 137 |
+
else:
|
| 138 |
+
raise RuntimeError(f"Unknown module type '{type_of_module}'")
|
| 139 |
+
nn.init.trunc_normal_(
|
| 140 |
+
module.weight,
|
| 141 |
+
mean=0.0,
|
| 142 |
+
std=std,
|
| 143 |
+
a=-cutoff_factor * std,
|
| 144 |
+
b=cutoff_factor * std,
|
| 145 |
+
)
|
| 146 |
+
else:
|
| 147 |
+
raise NotImplementedError(config.init_fn)
|
| 148 |
+
|
| 149 |
+
if isinstance(module, nn.Linear):
|
| 150 |
+
if module.bias is not None:
|
| 151 |
+
nn.init.zeros_(module.bias)
|
| 152 |
+
|
| 153 |
+
if config.init_fn == InitFnType.normal and getattr(module, "_is_residual", False):
|
| 154 |
+
with torch.no_grad():
|
| 155 |
+
module.weight.div_(math.sqrt(2 * config.n_layers))
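A minimal numeric sketch of the "mitchell" branch above, assuming hypothetical values d = 4096 and layer_id = 11 (these are placeholders, not taken from any config in this file):
import math

d, layer_id, std_factor = 4096, 11, 1.0               # hypothetical values
std = std_factor / math.sqrt(d)                       # base std, ~0.0156
std = std / math.sqrt(2 * (layer_id + 1))             # per-layer damping, ~0.0032
# weights are then drawn from trunc_normal_(mean=0.0, std=std, a=-3 * std, b=3 * std)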
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
|
| 159 |
+
"""
|
| 160 |
+
Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
|
| 161 |
+
is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
|
| 162 |
+
"""
|
| 163 |
+
if check_neg_inf:
|
| 164 |
+
x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
|
| 165 |
+
if check_pos_inf:
|
| 166 |
+
x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def activation_checkpoint_function(cfg: ModelConfig):
|
| 170 |
+
preserve_rng_state = (
|
| 171 |
+
(cfg.attention_dropout == 0.0) and (cfg.embedding_dropout == 0.0) and (cfg.residual_dropout == 0.0)
|
| 172 |
+
)
|
| 173 |
+
from torch.utils.checkpoint import checkpoint
|
| 174 |
+
|
| 175 |
+
return partial(
|
| 176 |
+
checkpoint,
|
| 177 |
+
preserve_rng_state=preserve_rng_state,
|
| 178 |
+
use_reentrant=False,
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class BufferCache(dict, MutableMapping[str, torch.Tensor]):
|
| 183 |
+
"""
|
| 184 |
+
Cache for attention biases and other things that would normally be stored as buffers.
|
| 185 |
+
We avoid using buffers because we've run into various issues doing so with FSDP.
|
| 186 |
+
In general it appears the way FSDP handles buffers is not well-defined.
|
| 187 |
+
It doesn't shard them but apparently it does synchronize them across processes, which we want to avoid
|
| 188 |
+
since (A) it isn't necessary, and (B) we sometimes have `-inf` in these biases which might get turned into
|
| 189 |
+
NaNs when they're synchronized due to casting or some other issue.
|
| 190 |
+
"""
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def _non_meta_init_device(config: ModelConfig) -> torch.device:
|
| 194 |
+
if config.init_device is not None and config.init_device != "meta":
|
| 195 |
+
return torch.device(config.init_device)
|
| 196 |
+
else:
|
| 197 |
+
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
class Dropout(nn.Dropout):
|
| 201 |
+
def forward(self, input: torch.Tensor) -> torch.Tensor:
|
| 202 |
+
if self.p == 0.0:
|
| 203 |
+
return input
|
| 204 |
+
else:
|
| 205 |
+
return F.dropout(input, self.p, self.training, self.inplace)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
class LayerNormBase(nn.Module):
|
| 209 |
+
def __init__(
|
| 210 |
+
self,
|
| 211 |
+
config: ModelConfig,
|
| 212 |
+
*,
|
| 213 |
+
size: Optional[int] = None,
|
| 214 |
+
elementwise_affine: Optional[bool] = True,
|
| 215 |
+
eps: float = 1e-05,
|
| 216 |
+
):
|
| 217 |
+
super().__init__()
|
| 218 |
+
self.config = config
|
| 219 |
+
self.eps = eps
|
| 220 |
+
self.normalized_shape = (size or config.d_model,)
|
| 221 |
+
if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine):
|
| 222 |
+
self.weight = nn.Parameter(torch.ones(self.normalized_shape, device=config.init_device))
|
| 223 |
+
use_bias = self.config.bias_for_layer_norm
|
| 224 |
+
if use_bias is None:
|
| 225 |
+
use_bias = self.config.include_bias
|
| 226 |
+
if use_bias:
|
| 227 |
+
self.bias = nn.Parameter(torch.zeros(self.normalized_shape, device=config.init_device))
|
| 228 |
+
else:
|
| 229 |
+
self.register_parameter("bias", None)
|
| 230 |
+
else:
|
| 231 |
+
self.register_parameter("bias", None)
|
| 232 |
+
self.register_parameter("weight", None)
|
| 233 |
+
|
| 234 |
+
@abstractmethod
|
| 235 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 236 |
+
raise NotImplementedError
|
| 237 |
+
|
| 238 |
+
@classmethod
|
| 239 |
+
def build(cls, config: ModelConfig, size: Optional[int] = None, **kwargs) -> LayerNormBase:
|
| 240 |
+
if config.layer_norm_type == LayerNormType.default:
|
| 241 |
+
return LayerNorm(config, size=size, low_precision=False, **kwargs)
|
| 242 |
+
elif config.layer_norm_type == LayerNormType.low_precision:
|
| 243 |
+
return LayerNorm(config, size=size, low_precision=True, **kwargs)
|
| 244 |
+
elif config.layer_norm_type == LayerNormType.rms:
|
| 245 |
+
return RMSLayerNorm(config, size=size, **kwargs)
|
| 246 |
+
elif config.layer_norm_type == LayerNormType.gemma_rms:
|
| 247 |
+
return GemmaRMSLayerNorm(config, size=size, **kwargs)
|
| 248 |
+
else:
|
| 249 |
+
raise NotImplementedError(f"Unknown LayerNorm type: '{config.layer_norm_type}'")
|
| 250 |
+
|
| 251 |
+
def _cast_if_autocast_enabled(self, tensor: torch.Tensor, dtype: Optional[torch.dtype] = None) -> torch.Tensor:
|
| 252 |
+
# NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
|
| 253 |
+
# `is_autocast_cpu_enabled()` for CPU autocast.
|
| 254 |
+
# See https://github.com/pytorch/pytorch/issues/110966.
|
| 255 |
+
if tensor.device.type == "cuda" and torch.is_autocast_enabled():
|
| 256 |
+
return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_gpu_dtype())
|
| 257 |
+
elif tensor.device.type == "cpu" and torch.is_autocast_cpu_enabled():
|
| 258 |
+
return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_cpu_dtype())
|
| 259 |
+
else:
|
| 260 |
+
return tensor
|
| 261 |
+
|
| 262 |
+
def reset_parameters(self):
|
| 263 |
+
if self.weight is not None:
|
| 264 |
+
torch.nn.init.ones_(self.weight) # type: ignore
|
| 265 |
+
if self.bias is not None:
|
| 266 |
+
torch.nn.init.zeros_(self.bias) # type: ignore
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
class LayerNorm(LayerNormBase):
|
| 270 |
+
"""
|
| 271 |
+
The default :class:`LayerNorm` implementation which can optionally run in low precision.
|
| 272 |
+
"""
|
| 273 |
+
|
| 274 |
+
def __init__(
|
| 275 |
+
self,
|
| 276 |
+
config: ModelConfig,
|
| 277 |
+
size: Optional[int] = None,
|
| 278 |
+
low_precision: bool = False,
|
| 279 |
+
elementwise_affine: Optional[bool] = None,
|
| 280 |
+
eps: float = 1e-05,
|
| 281 |
+
):
|
| 282 |
+
super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
|
| 283 |
+
self.low_precision = low_precision
|
| 284 |
+
|
| 285 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 286 |
+
if self.low_precision:
|
| 287 |
+
module_device = x.device
|
| 288 |
+
downcast_x = self._cast_if_autocast_enabled(x)
|
| 289 |
+
downcast_weight = (
|
| 290 |
+
self._cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
|
| 291 |
+
)
|
| 292 |
+
downcast_bias = self._cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
|
| 293 |
+
with torch.autocast(enabled=False, device_type=module_device.type):
|
| 294 |
+
return F.layer_norm(
|
| 295 |
+
downcast_x, self.normalized_shape, weight=downcast_weight, bias=downcast_bias, eps=self.eps
|
| 296 |
+
)
|
| 297 |
+
else:
|
| 298 |
+
return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
class RMSLayerNorm(LayerNormBase):
|
| 302 |
+
"""
|
| 303 |
+
RMS layer norm, a simplified :class:`LayerNorm` implementation
|
| 304 |
+
"""
|
| 305 |
+
|
| 306 |
+
def __init__(
|
| 307 |
+
self,
|
| 308 |
+
config: ModelConfig,
|
| 309 |
+
size: Optional[int] = None,
|
| 310 |
+
elementwise_affine: Optional[bool] = None,
|
| 311 |
+
eps: float = 1e-5,
|
| 312 |
+
):
|
| 313 |
+
super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=config.rms_norm_eps)
|
| 314 |
+
|
| 315 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 316 |
+
with torch.autocast(enabled=False, device_type=x.device.type):
|
| 317 |
+
og_dtype = x.dtype
|
| 318 |
+
x = x.to(torch.float32)
|
| 319 |
+
variance = x.pow(2).mean(-1, keepdim=True)
|
| 320 |
+
x = x * torch.rsqrt(variance + self.eps)
|
| 321 |
+
x = x.to(og_dtype)
|
| 322 |
+
|
| 323 |
+
if self.weight is not None:
|
| 324 |
+
if self.bias is not None:
|
| 325 |
+
return self.weight * x + self.bias
|
| 326 |
+
else:
|
| 327 |
+
return self.weight * x
|
| 328 |
+
else:
|
| 329 |
+
return x
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
class GemmaRMSLayerNorm(LayerNormBase):
|
| 333 |
+
"""
|
| 334 |
+
Gemma RMS layer norm, a simplified :class:`LayerNorm` implementation
|
| 335 |
+
"""
|
| 336 |
+
|
| 337 |
+
def __init__(
|
| 338 |
+
self,
|
| 339 |
+
config: ModelConfig,
|
| 340 |
+
size: Optional[int] = None,
|
| 341 |
+
elementwise_affine: Optional[bool] = None,
|
| 342 |
+
eps: float = 1e-5,
|
| 343 |
+
):
|
| 344 |
+
super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=config.rms_norm_eps)
|
| 345 |
+
|
| 346 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 347 |
+
with torch.autocast(enabled=False, device_type=x.device.type):
|
| 348 |
+
og_dtype = x.dtype
|
| 349 |
+
x = x.to(torch.float32)
|
| 350 |
+
variance = x.pow(2).mean(-1, keepdim=True)
|
| 351 |
+
x = x * torch.rsqrt(variance + self.eps)
|
| 352 |
+
x = x.to(og_dtype)
|
| 353 |
+
|
| 354 |
+
if self.weight is not None:
|
| 355 |
+
if self.bias is not None:
|
| 356 |
+
return x * (1 + self.weight) + self.bias
|
| 357 |
+
else:
|
| 358 |
+
return x * (1 + self.weight)
|
| 359 |
+
else:
|
| 360 |
+
return x
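The only difference from RMSLayerNorm above is how the affine weight is applied; a toy comparison of the two scalings (tensor shapes here are illustrative, not tied to the real config):
import torch

x = torch.randn(2, 3, 8)                              # toy shapes
w = torch.full((8,), 0.1)
rms = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
out_rms = w * rms                                     # RMSLayerNorm: scales by w directly
out_gemma = (1 + w) * rms                             # GemmaRMSLayerNorm: scales by (1 + w)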
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
class RotaryEmbedding(nn.Module):
|
| 364 |
+
"""
|
| 365 |
+
[Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
|
| 366 |
+
"""
|
| 367 |
+
|
| 368 |
+
def __init__(self, config: ModelConfig, cache: BufferCache):
|
| 369 |
+
super().__init__()
|
| 370 |
+
self.config = config
|
| 371 |
+
self.__cache = cache
|
| 372 |
+
# Warm up cache.
|
| 373 |
+
self.rope_theta = config.rope_theta
|
| 374 |
+
self.get_rotary_embedding(config.max_sequence_length, _non_meta_init_device(config))
|
| 375 |
+
|
| 376 |
+
def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 377 |
+
if (
|
| 378 |
+
(pos_sin := self.__cache.get("rope_pos_sin")) is not None
|
| 379 |
+
and (pos_cos := self.__cache.get("rope_pos_cos")) is not None
|
| 380 |
+
and pos_sin.shape[-2] >= seq_len
|
| 381 |
+
and pos_cos.shape[-2] >= seq_len
|
| 382 |
+
):
|
| 383 |
+
if pos_sin.device != device:
|
| 384 |
+
pos_sin = pos_sin.to(device)
|
| 385 |
+
self.__cache["rope_pos_sin"] = pos_sin
|
| 386 |
+
if pos_cos.device != device:
|
| 387 |
+
pos_cos = pos_cos.to(device)
|
| 388 |
+
self.__cache["rope_pos_cos"] = pos_cos
|
| 389 |
+
return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
|
| 390 |
+
|
| 391 |
+
with torch.autocast(device.type, enabled=False):
|
| 392 |
+
dim = self.config.d_model // self.config.n_heads
|
| 393 |
+
inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
|
| 394 |
+
seq = torch.arange(seq_len, device=device, dtype=torch.float)
|
| 395 |
+
freqs = einsum("i , j -> i j", seq, inv_freq)
|
| 396 |
+
positions = torch.cat((freqs, freqs), dim=-1)
|
| 397 |
+
pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
|
| 398 |
+
self.__cache["rope_pos_sin"] = pos_sin
|
| 399 |
+
self.__cache["rope_pos_cos"] = pos_cos
|
| 400 |
+
return pos_sin, pos_cos
|
| 401 |
+
|
| 402 |
+
def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
|
| 403 |
+
B, nh, T, hs = x.size()
|
| 404 |
+
x = x.view(B, nh, T, 2, hs // 2)
|
| 405 |
+
x1, x2 = x.unbind(dim=-2)
|
| 406 |
+
return torch.cat((-x2, x1), dim=-1)
|
| 407 |
+
|
| 408 |
+
def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
|
| 409 |
+
return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
|
| 410 |
+
|
| 411 |
+
def forward(self, q: torch.Tensor, k: torch.Tensor, q_mask=None) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 412 |
+
if self.config.rope_full_precision:
|
| 413 |
+
q_, k_ = q.float(), k.float()
|
| 414 |
+
else:
|
| 415 |
+
q_, k_ = q, k
|
| 416 |
+
|
| 417 |
+
with torch.autocast(q.device.type, enabled=False):
|
| 418 |
+
query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
|
| 419 |
+
pos_sin, pos_cos = self.get_rotary_embedding(key_len, q_.device)
|
| 420 |
+
pos_sin = pos_sin.type_as(q_)
|
| 421 |
+
pos_cos = pos_cos.type_as(q_)
|
| 422 |
+
if q_mask is None:
|
| 423 |
+
q_ = self.apply_rotary_pos_emb(
|
| 424 |
+
pos_sin[:, :, key_len - query_len : key_len, :],
|
| 425 |
+
pos_cos[:, :, key_len - query_len : key_len, :],
|
| 426 |
+
q_,
|
| 427 |
+
)
|
| 428 |
+
else:
|
| 429 |
+
q_ = self.apply_rotary_pos_emb(
|
| 430 |
+
pos_sin[:, :, q_mask, :],
|
| 431 |
+
pos_cos[:, :, q_mask, :],
|
| 432 |
+
q_,
|
| 433 |
+
)
|
| 434 |
+
k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
|
| 435 |
+
return q_.type_as(q), k_.type_as(k)
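A minimal sketch of the sin/cos tables built by get_rotary_embedding above, assuming placeholder sizes (head_dim = 32, T = 16) rather than values from ModelConfig:
import torch

head_dim, T, theta = 32, 16, 10000.0                  # hypothetical sizes
inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.einsum("i,j->ij", torch.arange(T).float(), inv_freq)
positions = torch.cat((freqs, freqs), dim=-1)         # same layout as above
pos_sin, pos_cos = positions.sin()[None, None], positions.cos()[None, None]
q = torch.randn(1, 4, T, head_dim)                    # (batch, n_heads, seq, head_dim)
# rotation is then applied as in apply_rotary_pos_emb: (q * pos_cos) + (rotate_half(q) * pos_sin)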
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
class Activation(nn.Module):
|
| 439 |
+
def __init__(self, config: ModelConfig):
|
| 440 |
+
super().__init__()
|
| 441 |
+
self.config = config
|
| 442 |
+
|
| 443 |
+
@abstractmethod
|
| 444 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 445 |
+
raise NotImplementedError
|
| 446 |
+
|
| 447 |
+
@property
|
| 448 |
+
@abstractmethod
|
| 449 |
+
def output_multiplier(self) -> float:
|
| 450 |
+
raise NotImplementedError
|
| 451 |
+
|
| 452 |
+
@classmethod
|
| 453 |
+
def build(cls, config: ModelConfig) -> Activation:
|
| 454 |
+
if config.activation_type == ActivationType.gelu:
|
| 455 |
+
return cast(Activation, GELU(approximate="none"))
|
| 456 |
+
elif config.activation_type == ActivationType.relu:
|
| 457 |
+
return cast(Activation, ReLU(inplace=False))
|
| 458 |
+
elif config.activation_type == ActivationType.silu:
|
| 459 |
+
return cast(Activation, SiLU(inplace=False))
|
| 460 |
+
elif config.activation_type == ActivationType.swiglu:
|
| 461 |
+
return SwiGLU(config)
|
| 462 |
+
else:
|
| 463 |
+
raise NotImplementedError(f"Unknown activation: '{config.activation_type}'")
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
class GELU(nn.GELU):
|
| 467 |
+
@property
|
| 468 |
+
def output_multiplier(self) -> float:
|
| 469 |
+
return 1.0
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
class ReLU(nn.ReLU):
|
| 473 |
+
@property
|
| 474 |
+
def output_multiplier(self) -> float:
|
| 475 |
+
return 1.0
|
| 476 |
+
|
| 477 |
+
class SiLU(nn.SiLU):
|
| 478 |
+
@property
|
| 479 |
+
def output_multiplier(self) -> float:
|
| 480 |
+
return 1.0
|
| 481 |
+
|
| 482 |
+
class SwiGLU(Activation):
|
| 483 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 484 |
+
x, gate = x.chunk(2, dim=-1)
|
| 485 |
+
return F.silu(gate) * x
|
| 486 |
+
|
| 487 |
+
@property
|
| 488 |
+
def output_multiplier(self) -> float:
|
| 489 |
+
return 0.5
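SwiGLU halves the feature dimension, which is why output_multiplier is 0.5 and why the feed-forward output projection built later in LLaDABlock takes int(0.5 * hidden_size) input features; a quick shape check with placeholder sizes:
import torch
import torch.nn.functional as F

hidden = torch.randn(2, 16, 1024)                     # (batch, seq, mlp hidden size); sizes are placeholders
x, gate = hidden.chunk(2, dim=-1)
out = F.silu(gate) * x
assert out.shape[-1] == hidden.shape[-1] // 2         # 512 = 0.5 * 1024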
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
|
| 493 |
+
att_bias = torch.triu(
|
| 494 |
+
torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
|
| 495 |
+
diagonal=1,
|
| 496 |
+
)
|
| 497 |
+
att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
|
| 498 |
+
return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.device) -> torch.Tensor:
|
| 502 |
+
if (causal_bias := cache.get("causal_attention_bias")) is not None and causal_bias.shape[-1] >= seq_len:
|
| 503 |
+
if causal_bias.device != device:
|
| 504 |
+
causal_bias = causal_bias.to(device)
|
| 505 |
+
cache["causal_attention_bias"] = causal_bias
|
| 506 |
+
return causal_bias
|
| 507 |
+
with torch.autocast(device.type, enabled=False):
|
| 508 |
+
causal_bias = causal_attention_bias(seq_len, device)
|
| 509 |
+
cache["causal_attention_bias"] = causal_bias
|
| 510 |
+
return causal_bias
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device) -> torch.FloatTensor:
|
| 514 |
+
alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len)
|
| 515 |
+
|
| 516 |
+
# shape: (1, 1, seq_len, seq_len)
|
| 517 |
+
alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1)
|
| 518 |
+
alibi_bias.abs_().mul_(-1)
|
| 519 |
+
|
| 520 |
+
# shape: (n_heads,)
|
| 521 |
+
m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device)
|
| 522 |
+
m.mul_(config.alibi_bias_max / config.n_heads)
|
| 523 |
+
|
| 524 |
+
# shape: (1, n_heads, seq_len, seq_len)
|
| 525 |
+
return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore
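A small sketch of what these two bias helpers produce for a toy sequence (seq_len = 4 is arbitrary):
import torch

seq_len = 4
causal = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
causal.masked_fill_(causal == 1, torch.finfo(torch.float).min)
# causal[i, j] is a very large negative value for j > i and 0 elsewhere; reshaped to (1, 1, 4, 4)

dist = torch.arange(1 - seq_len, 1).float().view(1, 1, 1, seq_len)
dist = dist - torch.arange(1 - seq_len, 1).float().view(1, 1, seq_len, 1)
dist.abs_().mul_(-1)                                  # -|i - j|; ALiBi then scales this per head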
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
class LLaDABlock(nn.Module):
|
| 529 |
+
"""
|
| 530 |
+
A base class for transformer block implementations.
|
| 531 |
+
"""
|
| 532 |
+
|
| 533 |
+
def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
|
| 534 |
+
super().__init__()
|
| 535 |
+
self.layer_id = layer_id
|
| 536 |
+
self.config = config
|
| 537 |
+
self.hidden_size = (
|
| 538 |
+
config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
|
| 539 |
+
)
|
| 540 |
+
self.__cache = cache
|
| 541 |
+
assert config.d_model % config.n_heads == 0
|
| 542 |
+
|
| 543 |
+
self._activation_checkpoint_fn = None
|
| 544 |
+
|
| 545 |
+
# Dropout.
|
| 546 |
+
self.dropout = Dropout(config.residual_dropout)
|
| 547 |
+
|
| 548 |
+
# Layer norms.
|
| 549 |
+
self.k_norm: Optional[LayerNormBase] = None
|
| 550 |
+
self.q_norm: Optional[LayerNormBase] = None
|
| 551 |
+
if config.attention_layer_norm:
|
| 552 |
+
self.k_norm = LayerNormBase.build(
|
| 553 |
+
config,
|
| 554 |
+
size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
|
| 555 |
+
elementwise_affine=config.attention_layer_norm_with_affine,
|
| 556 |
+
)
|
| 557 |
+
self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)
|
| 558 |
+
|
| 559 |
+
# Activation function.
|
| 560 |
+
self.act = Activation.build(config)
|
| 561 |
+
assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
|
| 562 |
+
|
| 563 |
+
# Attention output projection.
|
| 564 |
+
self.attn_out = nn.Linear(
|
| 565 |
+
config.d_model, config.d_model, bias=config.include_bias, device=config.init_device
|
| 566 |
+
)
|
| 567 |
+
|
| 568 |
+
# Feed-forward output projection.
|
| 569 |
+
self.ff_out = nn.Linear(
|
| 570 |
+
int(self.act.output_multiplier * self.hidden_size),
|
| 571 |
+
config.d_model,
|
| 572 |
+
bias=config.include_bias,
|
| 573 |
+
device=config.init_device,
|
| 574 |
+
)
|
| 575 |
+
self.ff_out._is_residual = True # type: ignore
|
| 576 |
+
|
| 577 |
+
# Rotary embeddings.
|
| 578 |
+
if self.config.rope:
|
| 579 |
+
self.rotary_emb = RotaryEmbedding(config, self.__cache)
|
| 580 |
+
|
| 581 |
+
self.flash_attn_func = None
|
| 582 |
+
if config.flash_attention:
|
| 583 |
+
try:
|
| 584 |
+
from flash_attn import flash_attn_func # type: ignore
|
| 585 |
+
|
| 586 |
+
self.flash_attn_func = flash_attn_func
|
| 587 |
+
except ModuleNotFoundError:
|
| 588 |
+
pass
|
| 589 |
+
|
| 590 |
+
self.use_cache = False
|
| 591 |
+
self.init_cache()
|
| 592 |
+
|
| 593 |
+
def init_cache(self):
|
| 594 |
+
self.cache = {
|
| 595 |
+
'k': {}, 'v': {}, 'out': {}
|
| 596 |
+
}
|
| 597 |
+
|
| 598 |
+
def caching(self, enable: bool = True):
|
| 599 |
+
self.use_cache = enable
|
| 600 |
+
self.init_cache()
|
| 601 |
+
|
| 602 |
+
def reset_parameters(self):
|
| 603 |
+
if self.k_norm is not None:
|
| 604 |
+
self.k_norm.reset_parameters()
|
| 605 |
+
if self.q_norm is not None:
|
| 606 |
+
self.q_norm.reset_parameters()
|
| 607 |
+
init_weights(
|
| 608 |
+
self.config,
|
| 609 |
+
self.attn_out,
|
| 610 |
+
d=self.config.d_model,
|
| 611 |
+
layer_id=self.layer_id,
|
| 612 |
+
type_of_module=ModuleType.out_module,
|
| 613 |
+
)
|
| 614 |
+
init_weights(
|
| 615 |
+
self.config,
|
| 616 |
+
self.ff_out,
|
| 617 |
+
d=self.ff_out.in_features,
|
| 618 |
+
layer_id=self.layer_id,
|
| 619 |
+
type_of_module=ModuleType.out_module,
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
|
| 623 |
+
if strategy == ActivationCheckpointingStrategy.fine_grained:
|
| 624 |
+
self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
|
| 625 |
+
else:
|
| 626 |
+
self._activation_checkpoint_fn = None
|
| 627 |
+
|
| 628 |
+
@classmethod
|
| 629 |
+
def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
|
| 630 |
+
target_dtype = input_dtype
|
| 631 |
+
# NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
|
| 632 |
+
# `is_autocast_cpu_enabled()` for CPU autocast.
|
| 633 |
+
# See https://github.com/pytorch/pytorch/issues/110966.
|
| 634 |
+
if bias.device.type == "cuda" and torch.is_autocast_enabled():
|
| 635 |
+
target_dtype = torch.get_autocast_gpu_dtype()
|
| 636 |
+
elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
|
| 637 |
+
target_dtype = torch.get_autocast_cpu_dtype()
|
| 638 |
+
if bias.dtype != target_dtype:
|
| 639 |
+
bias = bias.to(target_dtype)
|
| 640 |
+
ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
|
| 641 |
+
return bias
|
| 642 |
+
|
| 643 |
+
def _scaled_dot_product_attention(
|
| 644 |
+
self,
|
| 645 |
+
q: torch.Tensor,
|
| 646 |
+
k: torch.Tensor,
|
| 647 |
+
v: torch.Tensor,
|
| 648 |
+
attn_mask: Optional[torch.Tensor] = None,
|
| 649 |
+
dropout_p: float = 0.0,
|
| 650 |
+
is_causal: bool = False,
|
| 651 |
+
) -> torch.Tensor:
|
| 652 |
+
"""
|
| 653 |
+
Computes scaled dot product attention on query, key and value tensors, using an optional
|
| 654 |
+
attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.
|
| 655 |
+
"""
|
| 656 |
+
if self.flash_attn_func is not None and attn_mask is None:
|
| 657 |
+
r = self.flash_attn_func(
|
| 658 |
+
q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=False
|
| 659 |
+
)
|
| 660 |
+
return r.transpose(1, 2)
|
| 661 |
+
else:
|
| 662 |
+
# torch's SDPA doesn't support GQA directly, so expand the K/V heads to match the query heads
|
| 663 |
+
assert k.size(1) == v.size(1)
|
| 664 |
+
num_kv_heads = k.size(1)
|
| 665 |
+
num_q_heads = q.size(1)
|
| 666 |
+
if num_q_heads != num_kv_heads:
|
| 667 |
+
assert num_q_heads % num_kv_heads == 0
|
| 668 |
+
k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
|
| 669 |
+
v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
|
| 670 |
+
|
| 671 |
+
# Modified for MDM: attention is bidirectional, so is_causal=False and no attn_mask is passed.
|
| 672 |
+
return F.scaled_dot_product_attention(
|
| 673 |
+
q,
|
| 674 |
+
k,
|
| 675 |
+
v,
|
| 676 |
+
attn_mask=None,
|
| 677 |
+
dropout_p=dropout_p,
|
| 678 |
+
is_causal=False,
|
| 679 |
+
)
|
| 680 |
+
|
| 681 |
+
def attention(
|
| 682 |
+
self,
|
| 683 |
+
q: torch.Tensor,
|
| 684 |
+
k: torch.Tensor,
|
| 685 |
+
v: torch.Tensor,
|
| 686 |
+
attention_bias: Optional[torch.Tensor] = None,
|
| 687 |
+
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 688 |
+
to_compute_mask = None,
|
| 689 |
+
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
| 690 |
+
B, T, C = q.size() # batch size, sequence length, d_model
|
| 691 |
+
dtype = k.dtype
|
| 692 |
+
|
| 693 |
+
# Optionally apply layer norm to keys and queries.
|
| 694 |
+
if self.q_norm is not None and self.k_norm is not None:
|
| 695 |
+
q = self.q_norm(q).to(dtype=dtype)
|
| 696 |
+
k = self.k_norm(k).to(dtype=dtype)
|
| 697 |
+
|
| 698 |
+
# Move head forward to be next to the batch dim.
|
| 699 |
+
# shape: (B, nh, T, hs)
|
| 700 |
+
q = q.view(B, -1, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
|
| 701 |
+
# shape: (B, n_kv_h, T, hs)
|
| 702 |
+
k = k.view(B, -1, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
|
| 703 |
+
# shape: (B, n_kv_h, T, hs)
|
| 704 |
+
v = v.view(B, -1, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
|
| 705 |
+
|
| 706 |
+
if layer_past is not None:
|
| 707 |
+
past_key, past_value = layer_past
|
| 708 |
+
k = torch.cat((past_key, k), dim=-2)
|
| 709 |
+
v = torch.cat((past_value, v), dim=-2)
|
| 710 |
+
|
| 711 |
+
# present = (k, v) if use_cache else None
|
| 712 |
+
# query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
|
| 713 |
+
|
| 714 |
+
if self.config.rope:
|
| 715 |
+
to_compute_index = to_compute_mask.nonzero(as_tuple=True)[1] if self.use_cache and to_compute_mask is not None else None
|
| 716 |
+
q, k = self.rotary_emb(q, k, q_mask=to_compute_index)
|
| 717 |
+
|
| 718 |
+
if attention_bias is not None:
|
| 719 |
+
# Resize and cast attention bias.
|
| 720 |
+
# The current dtype of the attention bias might not match the dtype that the SDP attn function will
|
| 721 |
+
# run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
|
| 722 |
+
# as down-casting the attention bias to the autocast precision will result in -infs, which will
|
| 723 |
+
# cause the SDP attn function to produce NaNs.
|
| 724 |
+
attention_bias = self._cast_attn_bias(
|
| 725 |
+
# attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype
|
| 726 |
+
attention_bias, dtype
|
| 727 |
+
)
|
| 728 |
+
|
| 729 |
+
# Get the attention scores.
|
| 730 |
+
# shape: (B, nh, T, hs)
|
| 731 |
+
att = self._scaled_dot_product_attention(
|
| 732 |
+
q,
|
| 733 |
+
k,
|
| 734 |
+
v,
|
| 735 |
+
attn_mask=None,
|
| 736 |
+
dropout_p=0.0 if not self.training else self.config.attention_dropout,
|
| 737 |
+
is_causal=False,
|
| 738 |
+
)
|
| 739 |
+
|
| 740 |
+
# Re-assemble all head outputs side-by-side.
|
| 741 |
+
att = att.transpose(1, 2).contiguous().view(B, T, C)
|
| 742 |
+
|
| 743 |
+
# Apply output projection.
|
| 744 |
+
return self.attn_out(att), None
|
| 745 |
+
|
| 746 |
+
@abstractmethod
|
| 747 |
+
def forward(
|
| 748 |
+
self,
|
| 749 |
+
x: torch.Tensor,
|
| 750 |
+
attention_bias: Optional[torch.FloatTensor] = None,
|
| 751 |
+
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 752 |
+
use_cache: bool = False,
|
| 753 |
+
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
| 754 |
+
raise NotImplementedError
|
| 755 |
+
|
| 756 |
+
@classmethod
|
| 757 |
+
def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> LLaDABlock:
|
| 758 |
+
if config.block_type == BlockType.sequential:
|
| 759 |
+
return LLaDASequentialBlock(layer_id, config, cache)
|
| 760 |
+
elif config.block_type == BlockType.llama:
|
| 761 |
+
return LLaDALlamaBlock(layer_id, config, cache)
|
| 762 |
+
else:
|
| 763 |
+
raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
|
| 764 |
+
|
| 765 |
+
|
| 766 |
+
class LLaDASequentialBlock(LLaDABlock):
|
| 767 |
+
"""
|
| 768 |
+
This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
|
| 769 |
+
(plus another skip connection).
|
| 770 |
+
"""
|
| 771 |
+
|
| 772 |
+
def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
|
| 773 |
+
super().__init__(layer_id, config, cache)
|
| 774 |
+
# Layer norms.
|
| 775 |
+
self.attn_norm = LayerNorm.build(config)
|
| 776 |
+
self.ff_norm = LayerNorm.build(config)
|
| 777 |
+
# Attention input projection. Projects x -> (q, k, v)
|
| 778 |
+
head_dim = config.d_model // config.n_heads
|
| 779 |
+
self.fused_dims = (
|
| 780 |
+
config.d_model,
|
| 781 |
+
config.effective_n_kv_heads * head_dim,
|
| 782 |
+
config.effective_n_kv_heads * head_dim,
|
| 783 |
+
)
|
| 784 |
+
self.att_proj = nn.Linear(
|
| 785 |
+
config.d_model, sum(self.fused_dims), bias=config.include_bias | config.include_qkv_bias, device=config.init_device
|
| 786 |
+
)
|
| 787 |
+
# Feed-forward input projection.
|
| 788 |
+
self.ff_proj = nn.Linear(
|
| 789 |
+
config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
|
| 790 |
+
)
|
| 791 |
+
|
| 792 |
+
def reset_parameters(self):
|
| 793 |
+
super().reset_parameters()
|
| 794 |
+
self.attn_norm.reset_parameters()
|
| 795 |
+
self.ff_norm.reset_parameters()
|
| 796 |
+
# NOTE: the standard deviation for these weights does not depend on the layer.
|
| 797 |
+
init_weights(
|
| 798 |
+
self.config, self.att_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
|
| 799 |
+
)
|
| 800 |
+
init_weights(
|
| 801 |
+
self.config, self.ff_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
|
| 802 |
+
)
|
| 803 |
+
|
| 804 |
+
def forward(
|
| 805 |
+
self,
|
| 806 |
+
x: torch.Tensor,
|
| 807 |
+
attention_bias: Optional[torch.Tensor] = None,
|
| 808 |
+
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 809 |
+
use_cache: bool = False,
|
| 810 |
+
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
| 811 |
+
# Get query, key, value projections.
|
| 812 |
+
# shape:
|
| 813 |
+
# - for regular attn q, k, v: (batch_size, seq_len, d_model)
|
| 814 |
+
# - for multi-query attn q: (batch_size, seq_len, d_model)
|
| 815 |
+
# k, v: (batch_size, seq_len, d_model // n_heads)
|
| 816 |
+
# - for group query attn q: (batch_size, seq_len, d_model)
|
| 817 |
+
# k, v: (batch_size, seq_len, d_model // n_kv_heads)
|
| 818 |
+
if self._activation_checkpoint_fn is not None:
|
| 819 |
+
q, k, v = self.att_proj(self._activation_checkpoint_fn(self.attn_norm, x)).split(
|
| 820 |
+
self.fused_dims, dim=-1
|
| 821 |
+
)
|
| 822 |
+
else:
|
| 823 |
+
q, k, v = self.att_proj(self.attn_norm(x)).split(self.fused_dims, dim=-1)
|
| 824 |
+
|
| 825 |
+
# Get attention scores.
|
| 826 |
+
if self._activation_checkpoint_fn is not None:
|
| 827 |
+
att, cache = self._activation_checkpoint_fn( # type: ignore
|
| 828 |
+
self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
|
| 829 |
+
)
|
| 830 |
+
else:
|
| 831 |
+
att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
|
| 832 |
+
|
| 833 |
+
# Add attention scores.
|
| 834 |
+
# shape: (B, T, C)
|
| 835 |
+
x = x + self.dropout(att)
|
| 836 |
+
|
| 837 |
+
# Add feed-forward projection.
|
| 838 |
+
# shape: (batch_size, seq_len, d_model)
|
| 839 |
+
og_x = x
|
| 840 |
+
if self._activation_checkpoint_fn is not None:
|
| 841 |
+
x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
|
| 842 |
+
else:
|
| 843 |
+
x = self.ff_norm(x)
|
| 844 |
+
x = self.ff_proj(x)
|
| 845 |
+
if self._activation_checkpoint_fn is not None:
|
| 846 |
+
x = self._activation_checkpoint_fn(self.act, x) # type: ignore
|
| 847 |
+
else:
|
| 848 |
+
x = self.act(x)
|
| 849 |
+
x = self.ff_out(x)
|
| 850 |
+
x = self.dropout(x)
|
| 851 |
+
x = og_x + x
|
| 852 |
+
|
| 853 |
+
return x, cache
|
| 854 |
+
|
| 855 |
+
|
| 856 |
+
class LLaDALlamaBlock(LLaDABlock):
|
| 857 |
+
"""
|
| 858 |
+
This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
|
| 859 |
+
(plus another skip connection). This block is similar to `LLaDASequentialBlock`
|
| 860 |
+
but some operations have slightly different implementations to imitate the
|
| 861 |
+
behavior of Llama.
|
| 862 |
+
"""
|
| 863 |
+
|
| 864 |
+
def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
|
| 865 |
+
super().__init__(layer_id, config, cache)
|
| 866 |
+
# Layer norms.
|
| 867 |
+
self.attn_norm = LayerNorm.build(config)
|
| 868 |
+
self.ff_norm = LayerNorm.build(config)
|
| 869 |
+
self.__cache = cache
|
| 870 |
+
|
| 871 |
+
# Attention input projection. Projects x -> (q, k, v)
|
| 872 |
+
head_dim = config.d_model // config.n_heads
|
| 873 |
+
q_proj_out_dim = config.d_model
|
| 874 |
+
k_proj_out_dim = config.effective_n_kv_heads * head_dim
|
| 875 |
+
v_proj_out_dim = config.effective_n_kv_heads * head_dim
|
| 876 |
+
self.q_proj = nn.Linear(
|
| 877 |
+
config.d_model, q_proj_out_dim, bias=config.include_bias | config.include_qkv_bias, device=config.init_device
|
| 878 |
+
)
|
| 879 |
+
self.k_proj = nn.Linear(
|
| 880 |
+
config.d_model, k_proj_out_dim, bias=config.include_bias | config.include_qkv_bias, device=config.init_device
|
| 881 |
+
)
|
| 882 |
+
self.v_proj = nn.Linear(
|
| 883 |
+
config.d_model, v_proj_out_dim, bias=config.include_bias | config.include_qkv_bias, device=config.init_device
|
| 884 |
+
)
|
| 885 |
+
|
| 886 |
+
# Feed-forward input projection.
|
| 887 |
+
self.ff_proj = nn.Linear(
|
| 888 |
+
config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
|
| 889 |
+
)
|
| 890 |
+
# Newly added: up projection for the gated (SwiGLU-style) MLP.
|
| 891 |
+
self.up_proj = nn.Linear(
|
| 892 |
+
config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
|
| 893 |
+
)
|
| 894 |
+
|
| 895 |
+
def reset_parameters(self):
|
| 896 |
+
super().reset_parameters()
|
| 897 |
+
self.attn_norm.reset_parameters()
|
| 898 |
+
self.ff_norm.reset_parameters()
|
| 899 |
+
# NOTE: the standard deviation for these weights does not depend on the layer.
|
| 900 |
+
init_weights(self.config, self.q_proj, d=self.config.d_model, layer_id=None)
|
| 901 |
+
init_weights(self.config, self.k_proj, d=self.config.d_model, layer_id=None)
|
| 902 |
+
init_weights(self.config, self.v_proj, d=self.config.d_model, layer_id=None)
|
| 903 |
+
init_weights(self.config, self.ff_proj, d=self.config.d_model, layer_id=None)
|
| 904 |
+
init_weights(self.config, self.up_proj, d=self.config.d_model, layer_id=None)  # newly added up projection
|
| 905 |
+
|
| 906 |
+
def forward(
|
| 907 |
+
self,
|
| 908 |
+
x: torch.Tensor,
|
| 909 |
+
attention_bias: Optional[torch.Tensor] = None,
|
| 910 |
+
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 911 |
+
use_cache: bool = False,
|
| 912 |
+
cat = 'cond',
|
| 913 |
+
to_compute_mask = None,
|
| 914 |
+
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
| 915 |
+
# Get query, key, value projections.
|
| 916 |
+
# shape:
|
| 917 |
+
# - for regular attn q, k, v: (batch_size, seq_len, d_model)
|
| 918 |
+
# - for multi-query attn q: (batch_size, seq_len, d_model)
|
| 919 |
+
# k, v: (batch_size, seq_len, d_model // n_heads)
|
| 920 |
+
# - for group query attn q: (batch_size, seq_len, d_model)
|
| 921 |
+
# k, v: (batch_size, seq_len, d_model // n_kv_heads)
|
| 922 |
+
B, T, D = x.shape
|
| 923 |
+
|
| 924 |
+
x_normed = self.attn_norm(x)
|
| 925 |
+
q = self.q_proj(x_normed)
|
| 926 |
+
k = self.k_proj(x_normed)
|
| 927 |
+
v = self.v_proj(x_normed)
|
| 928 |
+
|
| 929 |
+
if use_cache:
|
| 930 |
+
if cat not in self.cache['k']:
|
| 931 |
+
self.cache['k'][cat] = torch.zeros_like(x)
|
| 932 |
+
self.cache['v'][cat] = torch.zeros_like(x)
|
| 933 |
+
if to_compute_mask is not None:
|
| 934 |
+
self.cache['k'][cat][to_compute_mask] = k.view(-1, D)
|
| 935 |
+
self.cache['v'][cat][to_compute_mask] = v.view(-1, D)
|
| 936 |
+
k = self.cache['k'][cat]
|
| 937 |
+
v = self.cache['v'][cat]
|
| 938 |
+
else:
|
| 939 |
+
self.cache['k'][cat] = k
|
| 940 |
+
self.cache['v'][cat] = v
|
| 941 |
+
|
| 942 |
+
# Get attention scores.
|
| 943 |
+
if self._activation_checkpoint_fn is not None:
|
| 944 |
+
att, cache = self._activation_checkpoint_fn( # type: ignore
|
| 945 |
+
self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
|
| 946 |
+
)
|
| 947 |
+
else:
|
| 948 |
+
att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past,
|
| 949 |
+
to_compute_mask=to_compute_mask)
|
| 950 |
+
|
| 951 |
+
# Add attention scores.
|
| 952 |
+
# shape: (B, T, C)
|
| 953 |
+
x = x + self.dropout(att)
|
| 954 |
+
|
| 955 |
+
# Add feed-forward projection.
|
| 956 |
+
# shape: (batch_size, seq_len, d_model)
|
| 957 |
+
og_x = x
|
| 958 |
+
if self._activation_checkpoint_fn is not None:
|
| 959 |
+
x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
|
| 960 |
+
else:
|
| 961 |
+
x = self.ff_norm(x)
|
| 962 |
+
x, x_up = self.ff_proj(x), self.up_proj(x)  # newly added: gate and up projections
|
| 963 |
+
if self._activation_checkpoint_fn is not None:
|
| 964 |
+
x = self._activation_checkpoint_fn(self.act, x) # type: ignore
|
| 965 |
+
else:
|
| 966 |
+
x = self.act(x)
|
| 967 |
+
x = x * x_up  # newly added: gated activation
|
| 968 |
+
x = self.ff_out(x)
|
| 969 |
+
x = self.dropout(x)
|
| 970 |
+
x = og_x + x
|
| 971 |
+
|
| 972 |
+
return x, cache
|
| 973 |
+
|
| 974 |
+
|
| 975 |
+
class LLaDAOutput(NamedTuple):
|
| 976 |
+
logits: torch.FloatTensor
|
| 977 |
+
"""
|
| 978 |
+
A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities
|
| 979 |
+
for the next token *before* normalization via (log) softmax.
|
| 980 |
+
"""
|
| 981 |
+
|
| 982 |
+
attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]
|
| 983 |
+
"""
|
| 984 |
+
Attention keys and values from each block.
|
| 985 |
+
"""
|
| 986 |
+
|
| 987 |
+
hidden_states: Optional[Tuple[torch.Tensor]]
|
| 988 |
+
"""
|
| 989 |
+
Hidden states from each block.
|
| 990 |
+
"""
|
| 991 |
+
|
| 992 |
+
|
| 993 |
+
class LLaDAGenerateOutput(NamedTuple):
|
| 994 |
+
token_ids: torch.LongTensor
|
| 995 |
+
"""
|
| 996 |
+
The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`.
|
| 997 |
+
These do *not* include the original input IDs.
|
| 998 |
+
"""
|
| 999 |
+
|
| 1000 |
+
scores: torch.FloatTensor
|
| 1001 |
+
"""
|
| 1002 |
+
The scores of the generated sequences, a tensor of shape `(batch_size, beam_size)`.
|
| 1003 |
+
"""
|
| 1004 |
+
|
| 1005 |
+
|
| 1006 |
+
class LLaDABlockGroup(nn.ModuleList):
|
| 1007 |
+
def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None):
|
| 1008 |
+
super().__init__(modules)
|
| 1009 |
+
self.config = config
|
| 1010 |
+
self.layer_offset = layer_offset
|
| 1011 |
+
self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
|
| 1012 |
+
self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
|
| 1013 |
+
|
| 1014 |
+
def forward(
|
| 1015 |
+
self,
|
| 1016 |
+
x: torch.Tensor,
|
| 1017 |
+
attention_bias: Optional[torch.FloatTensor] = None,
|
| 1018 |
+
layers_past: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
|
| 1019 |
+
use_cache: bool = False,
|
| 1020 |
+
) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]]]:
|
| 1021 |
+
attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
|
| 1022 |
+
for block_idx, block in enumerate(self):
|
| 1023 |
+
layer_past = None if layers_past is None else layers_past[block_idx]
|
| 1024 |
+
block_idx += self.layer_offset
|
| 1025 |
+
if (
|
| 1026 |
+
(self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.whole_layer)
|
| 1027 |
+
or (
|
| 1028 |
+
self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_two
|
| 1029 |
+
and block_idx % 2 == 0
|
| 1030 |
+
)
|
| 1031 |
+
or (
|
| 1032 |
+
self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_three
|
| 1033 |
+
and block_idx % 3 == 0
|
| 1034 |
+
)
|
| 1035 |
+
or (
|
| 1036 |
+
self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_four
|
| 1037 |
+
and block_idx % 4 == 0
|
| 1038 |
+
)
|
| 1039 |
+
):
|
| 1040 |
+
# shape: (batch_size, seq_len, d_model)
|
| 1041 |
+
x, cache = self._activation_checkpoint_fn( # type: ignore
|
| 1042 |
+
block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
|
| 1043 |
+
)
|
| 1044 |
+
else:
|
| 1045 |
+
# shape: (batch_size, seq_len, d_model)
|
| 1046 |
+
x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
|
| 1047 |
+
if attn_key_values is not None:
|
| 1048 |
+
assert cache is not None
|
| 1049 |
+
attn_key_values.append(cache)
|
| 1050 |
+
return x, attn_key_values
|
| 1051 |
+
|
| 1052 |
+
def reset_parameters(self):
|
| 1053 |
+
for block in self:
|
| 1054 |
+
block.reset_parameters()
|
| 1055 |
+
|
| 1056 |
+
def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
|
| 1057 |
+
self.activation_checkpointing_strategy = strategy
|
| 1058 |
+
for block in self:
|
| 1059 |
+
block.set_activation_checkpointing(strategy)
|
| 1060 |
+
|
| 1061 |
+
|
| 1062 |
+
class LLaDAModel(nn.Module):
|
| 1063 |
+
def __init__(self, config: ModelConfig, init_params: bool = True):
|
| 1064 |
+
super().__init__()
|
| 1065 |
+
self.config = config
|
| 1066 |
+
self.__cache = BufferCache()
|
| 1067 |
+
|
| 1068 |
+
# Validate config.
|
| 1069 |
+
if self.config.alibi and self.config.flash_attention:
|
| 1070 |
+
raise Exception("ALiBi is currently not supported with FlashAttention")
|
| 1071 |
+
|
| 1072 |
+
if self.config.alibi and self.config.rope:
|
| 1073 |
+
raise Exception("ALiBi and RoPE are mutually exclusive")
|
| 1074 |
+
|
| 1075 |
+
if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
|
| 1076 |
+
if self.config.embedding_size < self.config.vocab_size:
|
| 1077 |
+
raise Exception("embedding size should be at least as big as vocab size")
|
| 1078 |
+
elif self.config.embedding_size % 128 != 0:
|
| 1079 |
+
import warnings
|
| 1080 |
+
|
| 1081 |
+
warnings.warn(
|
| 1082 |
+
"Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
|
| 1083 |
+
)
|
| 1084 |
+
|
| 1085 |
+
self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
|
| 1086 |
+
self._activation_checkpoint_fn: Callable = activation_checkpoint_function(self.config)
|
| 1087 |
+
|
| 1088 |
+
if not (
|
| 1089 |
+
0 < self.config.block_group_size <= self.config.n_layers
|
| 1090 |
+
and self.config.n_layers % self.config.block_group_size == 0
|
| 1091 |
+
):
|
| 1092 |
+
raise Exception("n layers must be divisible by block group size")
|
| 1093 |
+
|
| 1094 |
+
torch.backends.cuda.enable_flash_sdp(True)
|
| 1095 |
+
torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it
|
| 1096 |
+
|
| 1097 |
+
self.transformer = nn.ModuleDict(
|
| 1098 |
+
dict(
|
| 1099 |
+
wte=nn.Embedding(
|
| 1100 |
+
config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
|
| 1101 |
+
),
|
| 1102 |
+
emb_drop=Dropout(config.embedding_dropout),
|
| 1103 |
+
ln_f=LayerNorm.build(config),
|
| 1104 |
+
)
|
| 1105 |
+
)
|
| 1106 |
+
|
| 1107 |
+
blocks = [LLaDABlock.build(i, config, self.__cache) for i in range(config.n_layers)]
|
| 1108 |
+
if self.config.block_group_size > 1:
|
| 1109 |
+
block_groups = [
|
| 1110 |
+
LLaDABlockGroup(config, i, blocks[i : i + config.block_group_size])
|
| 1111 |
+
for i in range(0, config.n_layers, config.block_group_size)
|
| 1112 |
+
]
|
| 1113 |
+
self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
|
| 1114 |
+
else:
|
| 1115 |
+
self.transformer.update({"blocks": nn.ModuleList(blocks)})
|
| 1116 |
+
|
| 1117 |
+
if not (self.config.alibi or self.config.rope):
|
| 1118 |
+
self.transformer.update(
|
| 1119 |
+
{"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
|
| 1120 |
+
)
|
| 1121 |
+
if not config.weight_tying:
|
| 1122 |
+
self.transformer.update(
|
| 1123 |
+
{
|
| 1124 |
+
"ff_out": nn.Linear(
|
| 1125 |
+
config.d_model,
|
| 1126 |
+
config.embedding_size or config.vocab_size,
|
| 1127 |
+
bias=config.include_bias,
|
| 1128 |
+
device=config.init_device,
|
| 1129 |
+
)
|
| 1130 |
+
}
|
| 1131 |
+
)
|
| 1132 |
+
# When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
|
| 1133 |
+
if init_params and self.config.init_device != "meta":
|
| 1134 |
+
self.reset_parameters()
|
| 1135 |
+
self.__num_fwd_flops: Optional[int] = None
|
| 1136 |
+
|
| 1137 |
+
# Warm up cache.
|
| 1138 |
+
if self.config.alibi:
|
| 1139 |
+
get_causal_attention_bias(self.__cache, config.max_sequence_length, _non_meta_init_device(config))
|
| 1140 |
+
self.get_alibi_attention_bias(config.max_sequence_length, _non_meta_init_device(config))
|
| 1141 |
+
|
| 1142 |
+
self.logit_cache = {}
|
| 1143 |
+
|
| 1144 |
+
def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
|
| 1145 |
+
self.activation_checkpointing_strategy = strategy
|
| 1146 |
+
if self.config.block_group_size != 1:
|
| 1147 |
+
for block_group in self.transformer.block_groups:
|
| 1148 |
+
block_group.set_activation_checkpointing(strategy)
|
| 1149 |
+
else:
|
| 1150 |
+
for block in self.transformer.blocks:
|
| 1151 |
+
block.set_activation_checkpointing(strategy)
|
| 1152 |
+
|
| 1153 |
+
@property
|
| 1154 |
+
def device(self) -> torch.device:
|
| 1155 |
+
device: torch.device = self.transformer.wte.weight.device # type: ignore
|
| 1156 |
+
if device.type == "meta":
|
| 1157 |
+
return _non_meta_init_device(self.config)
|
| 1158 |
+
else:
|
| 1159 |
+
return device
|
| 1160 |
+
|
| 1161 |
+
def reset_parameters(self):
|
| 1162 |
+
log.info("Initializing model parameters...")
|
| 1163 |
+
# Top-level embeddings / linear layers.
|
| 1164 |
+
init_weights(
|
| 1165 |
+
self.config,
|
| 1166 |
+
self.transformer.wte, # type: ignore
|
| 1167 |
+
std_factor=(0.5 * math.sqrt(self.config.d_model)) if self.config.scale_logits else 1.0,
|
| 1168 |
+
type_of_module=ModuleType.emb,
|
| 1169 |
+
)
|
| 1170 |
+
if hasattr(self.transformer, "wpe"):
|
| 1171 |
+
init_weights(self.config, self.transformer.wpe, type_of_module=ModuleType.emb) # type: ignore
|
| 1172 |
+
|
| 1173 |
+
# Top-level layer norm.
|
| 1174 |
+
self.transformer.ln_f.reset_parameters() # type: ignore
|
| 1175 |
+
|
| 1176 |
+
# Output weights.
|
| 1177 |
+
if hasattr(self.transformer, "ff_out"):
|
| 1178 |
+
init_weights(self.config, self.transformer.ff_out, type_of_module=ModuleType.final_out) # type: ignore
|
| 1179 |
+
|
| 1180 |
+
# Let the blocks handle themselves.
|
| 1181 |
+
if self.config.block_group_size == 1:
|
| 1182 |
+
for block in self.transformer.blocks:
|
| 1183 |
+
block.reset_parameters()
|
| 1184 |
+
else:
|
| 1185 |
+
for block_group in self.transformer.block_groups:
|
| 1186 |
+
block_group.reset_parameters()
|
| 1187 |
+
|
| 1188 |
+
def get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor:
|
| 1189 |
+
if (alibi_bias := self.__cache.get("alibi_attention_bias")) is not None and alibi_bias.shape[
|
| 1190 |
+
-1
|
| 1191 |
+
] >= seq_len:
|
| 1192 |
+
if alibi_bias.device != device:
|
| 1193 |
+
alibi_bias = alibi_bias.to(device)
|
| 1194 |
+
self.__cache["alibi_attention_bias"] = alibi_bias
|
| 1195 |
+
return alibi_bias
|
| 1196 |
+
with torch.autocast(device.type, enabled=False):
|
| 1197 |
+
alibi_bias = alibi_attention_bias(seq_len, self.config, device)
|
| 1198 |
+
self.__cache["alibi_attention_bias"] = alibi_bias
|
| 1199 |
+
return alibi_bias
|
| 1200 |
+
|
| 1201 |
+
def forward(
|
| 1202 |
+
self,
|
| 1203 |
+
input_ids: torch.LongTensor,
|
| 1204 |
+
input_embeddings: Optional[torch.FloatTensor] = None,
|
| 1205 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 1206 |
+
attention_bias: Optional[torch.Tensor] = None,
|
| 1207 |
+
past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
|
| 1208 |
+
last_logits_only: bool = False,
|
| 1209 |
+
output_hidden_states: Optional[bool] = None,
|
| 1210 |
+
use_cache = False,
|
| 1211 |
+
to_compute_mask = None,
|
| 1212 |
+
cat = '',
|
| 1213 |
+
) -> LLaDAOutput:
|
| 1214 |
+
"""
|
| 1215 |
+
:param input_ids: A tensor of shape `(batch_size, seq_len)`.
|
| 1216 |
+
:param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
|
| 1217 |
+
embeddings. When provided, it is treated as the output of the input embedding layer.
|
| 1218 |
+
:param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
|
| 1219 |
+
which input IDs are masked. A `1` value in the mask means that
|
| 1220 |
+
the corresponding input ID should *not* be ignored. A `0` means
|
| 1221 |
+
that the corresponding input ID is masked.
|
| 1222 |
+
|
| 1223 |
+
This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
|
| 1224 |
+
library.
|
| 1225 |
+
:param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
|
| 1226 |
+
`(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
|
| 1227 |
+
to introduce causal or other biases.
|
| 1228 |
+
|
| 1229 |
+
If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
|
| 1230 |
+
indicates that the i-th element in the sequence is allowed to attend to the j-th
|
| 1231 |
+
element in the sequence.
|
| 1232 |
+
|
| 1233 |
+
If the tensor is a float tensor, it will just be added to the attention
|
| 1234 |
+
scores before the softmax.
|
| 1235 |
+
|
| 1236 |
+
The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
|
| 1237 |
+
:param past_key_values: Pre-computed keys and values for each attention block.
|
| 1238 |
+
Can be used to speed up sequential decoding. The `input_ids` which have
|
| 1239 |
+
their past given to this model should not be passed as `input_ids` as they have already been computed.
|
| 1240 |
+
:param use_cache: If `True`, return key and value tensors for each block.
|
| 1241 |
+
:param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
|
| 1242 |
+
This can speed up decoding when you only care about the next token.
|
| 1243 |
+
"""
|
| 1244 |
+
if use_cache and to_compute_mask is not None:
|
| 1245 |
+
input_ids = input_ids[to_compute_mask].view(input_ids.shape[0], -1)
|
| 1246 |
+
|
| 1247 |
+
# Basic MDM model config checks.
|
| 1248 |
+
assert not self.config.alibi, "Alibi length extrapolation is not supported for MDM."
|
| 1249 |
+
assert self.config.rope, "Rope must be used in Llama-Encoder for MDM."
|
| 1250 |
+
# assert (past_key_values is None and not use_cache), "The kv-cache is not supported for MDM."
|
| 1251 |
+
|
| 1252 |
+
output_hidden_states = output_hidden_states if output_hidden_states is not None else False
|
| 1253 |
+
|
| 1254 |
+
if past_key_values:
|
| 1255 |
+
assert len(past_key_values) == self.config.n_layers
|
| 1256 |
+
|
| 1257 |
+
batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
|
| 1258 |
+
if past_key_values is None:
|
| 1259 |
+
past_length = 0
|
| 1260 |
+
else:
|
| 1261 |
+
past_length = past_key_values[0][0].size(-2)
|
| 1262 |
+
|
| 1263 |
+
# Get embeddings of input.
|
| 1264 |
+
# shape: (batch_size, seq_len, d_model)
|
| 1265 |
+
x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
|
| 1266 |
+
|
| 1267 |
+
if self.config.input_emb_norm:
|
| 1268 |
+
x = x * (self.config.d_model**0.5)
|
| 1269 |
+
|
| 1270 |
+
if not (self.config.alibi or self.config.rope):
|
| 1271 |
+
# Get positional embeddings.
|
| 1272 |
+
# shape: (1, seq_len)
|
| 1273 |
+
pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
|
| 1274 |
+
# shape: (1, seq_len, d_model)
|
| 1275 |
+
pos_emb = self.transformer.wpe(pos) # type: ignore
|
| 1276 |
+
x = pos_emb + x
|
| 1277 |
+
|
| 1278 |
+
# Add input + positional embeddings and apply dropout.
|
| 1279 |
+
# shape: (batch_size, seq_len, d_model)
|
| 1280 |
+
x = self.transformer.emb_drop(x) # type: ignore
|
| 1281 |
+
|
| 1282 |
+
# Transform the attention mask into what the blocks expect.
|
| 1283 |
+
if attention_mask is not None and 0.0 in attention_mask:
|
| 1284 |
+
# shape: (batch_size, 1, 1, seq_len)
|
| 1285 |
+
attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
|
| 1286 |
+
attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
|
| 1287 |
+
else:
|
| 1288 |
+
attention_mask = None
|
| 1289 |
+
|
| 1290 |
+
# Merge attention mask with attention bias.
|
| 1291 |
+
if (
|
| 1292 |
+
attention_bias is not None
|
| 1293 |
+
or attention_mask is not None
|
| 1294 |
+
or self.config.alibi
|
| 1295 |
+
# NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
|
| 1296 |
+
# with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
|
| 1297 |
+
# scores correctly.
|
| 1298 |
+
or past_key_values is not None
|
| 1299 |
+
):
|
| 1300 |
+
if attention_bias is None and self.config.alibi:
|
| 1301 |
+
attention_bias = get_causal_attention_bias(
|
| 1302 |
+
self.__cache, past_length + seq_len, x.device
|
| 1303 |
+
) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
|
| 1304 |
+
elif attention_bias is None:
|
| 1305 |
+
attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
|
| 1306 |
+
elif attention_bias.dtype in (torch.int8, torch.bool):
|
| 1307 |
+
attention_bias = attention_bias.to(dtype=torch.float)
|
| 1308 |
+
attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
|
| 1309 |
+
|
| 1310 |
+
# Transform to the right shape and data type.
|
| 1311 |
+
mask_len = seq_len
|
| 1312 |
+
if attention_mask is not None:
|
| 1313 |
+
mask_len = attention_mask.shape[-1]
|
| 1314 |
+
elif past_key_values is not None:
|
| 1315 |
+
mask_len = past_key_values[0][0].shape[-2] + seq_len
|
| 1316 |
+
attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)
|
| 1317 |
+
|
| 1318 |
+
# Add in the masking bias.
|
| 1319 |
+
if attention_mask is not None:
|
| 1320 |
+
attention_bias = attention_bias + attention_mask
|
| 1321 |
+
# Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
|
| 1322 |
+
# `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
|
| 1323 |
+
# it can produce NaNs.
|
| 1324 |
+
ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)
|
| 1325 |
+
|
| 1326 |
+
attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
|
| 1327 |
+
|
| 1328 |
+
# decoder layers
|
| 1329 |
+
all_hidden_states = []
|
| 1330 |
+
|
| 1331 |
+
# Apply blocks one-by-one.
|
| 1332 |
+
if self.config.block_group_size == 1:
|
| 1333 |
+
for block_idx, block in enumerate(self.transformer.blocks):
|
| 1334 |
+
if output_hidden_states:
|
| 1335 |
+
# add hidden states
|
| 1336 |
+
all_hidden_states.append(x)
|
| 1337 |
+
|
| 1338 |
+
layer_past = None if past_key_values is None else past_key_values[block_idx]
|
| 1339 |
+
if (
|
| 1340 |
+
(self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.whole_layer)
|
| 1341 |
+
or (
|
| 1342 |
+
self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_two
|
| 1343 |
+
and block_idx % 2 == 0
|
| 1344 |
+
)
|
| 1345 |
+
or (
|
| 1346 |
+
self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_three
|
| 1347 |
+
and block_idx % 3 == 0
|
| 1348 |
+
)
|
| 1349 |
+
or (
|
| 1350 |
+
self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_four
|
| 1351 |
+
and block_idx % 4 == 0
|
| 1352 |
+
)
|
| 1353 |
+
):
|
| 1354 |
+
# shape: (batch_size, seq_len, d_model)
|
| 1355 |
+
x, _ = self._activation_checkpoint_fn(
|
| 1356 |
+
block, x, attention_bias=attention_bias, layer_past=layer_past,
|
| 1357 |
+
to_compute_mask=to_compute_mask, use_cache=use_cache, cat=cat
|
| 1358 |
+
)
|
| 1359 |
+
else:
|
| 1360 |
+
# shape: (batch_size, seq_len, d_model)
|
| 1361 |
+
# see LLaDALlamaBlock.forward
|
| 1362 |
+
x, _ = block(x, attention_bias=attention_bias, layer_past=layer_past,
|
| 1363 |
+
to_compute_mask=to_compute_mask, use_cache=use_cache, cat=cat
|
| 1364 |
+
)
|
| 1365 |
+
else:
|
| 1366 |
+
for group_idx, block_group in enumerate(self.transformer.block_groups):
|
| 1367 |
+
if output_hidden_states:
|
| 1368 |
+
# add hidden states
|
| 1369 |
+
all_hidden_states.append(x)
|
| 1370 |
+
|
| 1371 |
+
layers_past = (
|
| 1372 |
+
None
|
| 1373 |
+
if past_key_values is None
|
| 1374 |
+
else past_key_values[
|
| 1375 |
+
group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
|
| 1376 |
+
]
|
| 1377 |
+
)
|
| 1378 |
+
x, _ = block_group(
|
| 1379 |
+
x, attention_bias=attention_bias, layers_past=layers_past,
|
| 1380 |
+
to_compute_mask=to_compute_mask, use_cache=use_cache, cat=cat
|
| 1381 |
+
)
|
| 1382 |
+
# if attn_key_values is not None:
|
| 1383 |
+
# assert cache is not None
|
| 1384 |
+
# attn_key_values.extend(cache)
|
| 1385 |
+
|
| 1386 |
+
if last_logits_only:
|
| 1387 |
+
# shape: (batch_size, 1, d_model)
|
| 1388 |
+
x = x[:, -1, :].unsqueeze(1)
|
| 1389 |
+
|
| 1390 |
+
# Apply final layer norm.
|
| 1391 |
+
# shape: (batch_size, seq_len or 1, d_model)
|
| 1392 |
+
x = self.transformer.ln_f(x) # type: ignore
|
| 1393 |
+
if output_hidden_states:
|
| 1394 |
+
# add final hidden state post-final-layernorm, following HuggingFace's convention
|
| 1395 |
+
all_hidden_states.append(x)
|
| 1396 |
+
|
| 1397 |
+
# Get logits.
|
| 1398 |
+
# shape: (batch_size, seq_len or 1, vocab_size)
|
| 1399 |
+
if self.config.weight_tying:
|
| 1400 |
+
logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
|
| 1401 |
+
else:
|
| 1402 |
+
logits = self.transformer.ff_out(x) # type: ignore
|
| 1403 |
+
if self.config.scale_logits:
|
| 1404 |
+
logits.mul_(1 / math.sqrt(self.config.d_model))
|
| 1405 |
+
|
| 1406 |
+
if use_cache:
|
| 1407 |
+
if cat not in self.logit_cache:
|
| 1408 |
+
self.logit_cache[cat] = torch.zeros_like(logits)
|
| 1409 |
+
if to_compute_mask is not None:
|
| 1410 |
+
self.logit_cache[cat][to_compute_mask] = logits.view(-1, logits.shape[-1])
|
| 1411 |
+
logits = self.logit_cache[cat]
|
| 1412 |
+
else:
|
| 1413 |
+
self.logit_cache[cat] = logits
|
| 1414 |
+
|
| 1415 |
+
return LLaDAOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type]
|
| 1416 |
+
|
| 1417 |
+
def caching(self, enable: bool = True):
|
| 1418 |
+
# see LLaDABlock.caching
|
| 1419 |
+
for block in self.transformer.blocks:
|
| 1420 |
+
block.caching(enable)
|
| 1421 |
+
self.logit_cache = {}
|
| 1422 |
+
|
| 1423 |
+
def empty_cache(self):
|
| 1424 |
+
for block in self.transformer.blocks:
|
| 1425 |
+
block.init_cache()
|
| 1426 |
+
self.logit_cache = {}
|
| 1427 |
+
|
| 1428 |
+
|
| 1429 |
+
def create_model_config_from_pretrained_config(config: LLaDAConfig):
|
| 1430 |
+
"""
|
| 1431 |
+
Utility function to build an internal ModelConfig from a pretrained LLaDAConfig.
|
| 1432 |
+
"""
|
| 1433 |
+
|
| 1434 |
+
kwargs = {}
|
| 1435 |
+
for field in fields(ModelConfig):
|
| 1436 |
+
kwargs[field.name] = getattr(config, field.name)
|
| 1437 |
+
|
| 1438 |
+
model_config = ModelConfig(**kwargs)
|
| 1439 |
+
return model_config
|
| 1440 |
+
|
| 1441 |
+
|
| 1442 |
+
class LLaDAModelLM(PreTrainedModel):
|
| 1443 |
+
"""
|
| 1444 |
+
Extremely barebones HF model wrapper.
|
| 1445 |
+
"""
|
| 1446 |
+
|
| 1447 |
+
config_class = LLaDAConfig
|
| 1448 |
+
base_model_prefix = "model"
|
| 1449 |
+
_no_split_modules = ["LLaDABlock", "LLaDASequentialBlock", "LLaDALlamaBlock"]
|
| 1450 |
+
|
| 1451 |
+
def __init__(self, config: LLaDAConfig, model: Optional[LLaDAModel] = None, init_params: bool = False):
|
| 1452 |
+
super().__init__(config)
|
| 1453 |
+
|
| 1454 |
+
if not model:
|
| 1455 |
+
model_config = create_model_config_from_pretrained_config(config)
|
| 1456 |
+
# Initialize model (always on CPU to start with so we don't run out of GPU memory).
|
| 1457 |
+
model_config.init_device = "cpu"
|
| 1458 |
+
self.model = LLaDAModel(model_config, init_params=init_params)
|
| 1459 |
+
else:
|
| 1460 |
+
self.model = model
|
| 1461 |
+
|
| 1462 |
+
def forward(
|
| 1463 |
+
self,
|
| 1464 |
+
input_ids: torch.LongTensor = None,
|
| 1465 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 1466 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 1467 |
+
attention_bias: Optional[torch.Tensor] = None,
|
| 1468 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| 1469 |
+
labels: Optional[torch.LongTensor] = None,
|
| 1470 |
+
output_attentions: Optional[bool] = None,
|
| 1471 |
+
output_hidden_states: Optional[bool] = None,
|
| 1472 |
+
return_dict: Optional[bool] = None,
|
| 1473 |
+
cache_position: Optional[Cache] = None, # This is a hack mitigation of an issue in transformers `4.39.x`
|
| 1474 |
+
use_cache = False,
|
| 1475 |
+
to_compute_mask = None,
|
| 1476 |
+
cat = '',
|
| 1477 |
+
) -> Union[Tuple, CausalLMOutputWithPast]:
|
| 1478 |
+
if output_attentions:
|
| 1479 |
+
raise ValueError("output_attentions is not yet supported in LLaDA")
|
| 1480 |
+
|
| 1481 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1482 |
+
|
| 1483 |
+
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
| 1484 |
+
outputs = self.model.forward(
|
| 1485 |
+
input_ids=input_ids,
|
| 1486 |
+
input_embeddings=inputs_embeds,
|
| 1487 |
+
attention_mask=attention_mask,
|
| 1488 |
+
attention_bias=attention_bias,
|
| 1489 |
+
past_key_values=past_key_values,
|
| 1490 |
+
output_hidden_states=output_hidden_states,
|
| 1491 |
+
use_cache=use_cache,
|
| 1492 |
+
to_compute_mask=to_compute_mask,
|
| 1493 |
+
cat=cat,
|
| 1494 |
+
)
|
| 1495 |
+
|
| 1496 |
+
logits = outputs.logits
|
| 1497 |
+
hidden_states = outputs.hidden_states
|
| 1498 |
+
|
| 1499 |
+
loss = None
|
| 1500 |
+
if labels is not None:
|
| 1501 |
+
import warnings
|
| 1502 |
+
warnings.warn("Note that for LLaDA, you cannot calculate the loss here.", UserWarning)
|
| 1503 |
+
if not return_dict:
|
| 1504 |
+
output = (logits,) + outputs[1:]
|
| 1505 |
+
return (loss,) + output if loss is not None else output
|
| 1506 |
+
|
| 1507 |
+
return CausalLMOutputWithPast(
|
| 1508 |
+
logits=logits,
|
| 1509 |
+
past_key_values=outputs.attn_key_values,
|
| 1510 |
+
hidden_states=hidden_states,
|
| 1511 |
+
)
|
| 1512 |
+
|
| 1513 |
+
def can_generate(self) -> bool:
|
| 1514 |
+
return True
|
| 1515 |
+
|
| 1516 |
+
def prepare_inputs_for_generation(
|
| 1517 |
+
self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
|
| 1518 |
+
):
|
| 1519 |
+
if past_key_values:
|
| 1520 |
+
# This is because we want the model to only process the last generated token.
|
| 1521 |
+
input_ids = input_ids[:, -1:]
|
| 1522 |
+
model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
|
| 1523 |
+
|
| 1524 |
+
model_inputs.update(kwargs)
|
| 1525 |
+
model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
|
| 1526 |
+
return model_inputs
|
| 1527 |
+
|
| 1528 |
+
# TODO: these are required to make the implementation complete.
|
| 1529 |
+
# def resize_position_embeddings(self, new_num_position_embeddings: int):
|
| 1530 |
+
# pass
|
| 1531 |
+
#
|
| 1532 |
+
# def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
|
| 1533 |
+
# pass
|
| 1534 |
+
#
|
| 1535 |
+
# def _reorder_cache(self, past_key_values, beam_idx):
|
| 1536 |
+
# pass
|
| 1537 |
+
|
| 1538 |
+
def get_input_embeddings(self) -> torch.nn.Module:
|
| 1539 |
+
return self.model.transformer.wte
|
| 1540 |
+
|
| 1541 |
+
def set_input_embeddings(self, value: torch.nn.Module):
|
| 1542 |
+
self.model.transformer.wte = value
|
| 1543 |
+
|
| 1544 |
+
def get_output_embeddings(self):
|
| 1545 |
+
if self.config.weight_tying:
|
| 1546 |
+
return self.model.transformer.wte
|
| 1547 |
+
else:
|
| 1548 |
+
return self.model.transformer.ff_out
|
| 1549 |
+
|
| 1550 |
+
def set_output_embeddings(self, value: torch.nn.Module):
|
| 1551 |
+
if self.config.weight_tying:
|
| 1552 |
+
self.model.transformer.wte = value
|
| 1553 |
+
else:
|
| 1554 |
+
self.model.transformer.ff_out = value
|
| 1555 |
+
|
| 1556 |
+
def tie_weights(self):
|
| 1557 |
+
if self.config.weight_tying:
|
| 1558 |
+
self.model.transformer.ff_out = self.model.transformer.wte
|
| 1559 |
+
|
| 1560 |
+
def caching(self, enable: bool = True):
|
| 1561 |
+
self.model.caching(enable)
|
| 1562 |
+
|
| 1563 |
+
def empty_cache(self):
|
| 1564 |
+
self.model.empty_cache()
|
| 1565 |
+
|
| 1566 |
+
# Register the model so that it is available for transformer pipelines, auto-loading, etc.
|
| 1567 |
+
AutoModel.register(LLaDAConfig, LLaDAModelLM)
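# Illustrative loading sketch: with the registration above, a checkpoint directory that
# bundles this modeling code can typically be loaded through the Auto classes. The path
# below is a placeholder, and `trust_remote_code=True` is only needed when loading from the Hub.
def _example_load_llada(checkpoint_dir: str = "path/to/llada-checkpoint"):
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True)
    return AutoModel.from_pretrained(checkpoint_dir, config=config, trust_remote_code=True)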
|
model/modeling_xllmx_dimoo.py
ADDED
|
@@ -0,0 +1,202 @@
|
| 1 |
+
import functools
|
| 2 |
+
import logging
|
| 3 |
+
import math
|
| 4 |
+
from typing import List, Dict, Tuple, Optional
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import torch
|
| 7 |
+
from torch import nn
|
| 8 |
+
from transformers import AutoTokenizer, AutoConfig
|
| 9 |
+
from .modeling_llada import LLaDAModelLM
|
| 10 |
+
from .configuration_llada import LLaDAConfig
|
| 11 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 12 |
+
|
| 13 |
+
__all__ = ["LLaDAForMultiModalGeneration"]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def create_attention_mask(original_lengths, max_tokens, device):
|
| 17 |
+
batch_size = len(original_lengths)
|
| 18 |
+
attention_mask = torch.zeros(batch_size, max_tokens, dtype=torch.bool, device=device)
|
| 19 |
+
for i, length in enumerate(original_lengths):
|
| 20 |
+
attention_mask[i, :length] = 1
|
| 21 |
+
return attention_mask
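# Minimal sketch of how the helper above is used: sequences of lengths 3 and 5 padded to
# 5 tokens yield a per-row padding mask, which `forward` later expands into a pairwise
# attention bias.
def _example_attention_mask():
    mask = create_attention_mask([3, 5], max_tokens=5, device="cpu")
    # mask[0] -> [True, True, True, False, False]; mask[1] -> all True
    attention_bias = (mask[:, :, None] & mask[:, None, :]).unsqueeze(1)  # (B, 1, L, L)
    return mask, attention_bias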
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class LLaDAForMultiModalGeneration(LLaDAModelLM):
|
| 25 |
+
config_class = LLaDAConfig
|
| 26 |
+
base_model_prefix = "model"
|
| 27 |
+
|
| 28 |
+
IMAGE_START_TOKEN = 126349
|
| 29 |
+
IMAGE_END_TOKEN = 126350
|
| 30 |
+
ANSWER_START_TOKEN = 126354
|
| 31 |
+
ANSWER_END_TOKEN = 126355
|
| 32 |
+
BREAKLINE_TOKEN = 126084
|
| 33 |
+
MASK_TOKEN = 126336
|
| 34 |
+
PAD_TOKEN = 126339
|
| 35 |
+
|
| 36 |
+
def __init__(self, config: LLaDAConfig, *args, **kwargs):
|
| 37 |
+
print(f"Initializing LLaDAForMultiModalGeneration with config: {config}")
|
| 38 |
+
super().__init__(config, *args, **kwargs)
|
| 39 |
+
self._debug_step = 0
|
| 40 |
+
|
| 41 |
+
def forward(
|
| 42 |
+
self,
|
| 43 |
+
input_ids=None,
|
| 44 |
+
labels=None,
|
| 45 |
+
infer=False,
|
| 46 |
+
use_cache=False,
|
| 47 |
+
return_dict=False,
|
| 48 |
+
compute_separate_losses=True,
|
| 49 |
+
t=None,
|
| 50 |
+
text_coeff=1.0,
|
| 51 |
+
image_coeff=1.0,
|
| 52 |
+
):
|
| 53 |
+
if infer:
|
| 54 |
+
input_ids = input_ids.tolist()
|
| 55 |
+
|
| 56 |
+
max_tokens = max([len(_) for _ in input_ids])
|
| 57 |
+
original_lengths = [len(example) for example in input_ids]
|
| 58 |
+
input_ids = [example + [0] * (max_tokens - len(example)) for example in input_ids]
|
| 59 |
+
input_ids = torch.tensor(input_ids, dtype=torch.int64, device=self.device)
|
| 60 |
+
|
| 61 |
+
attention_mask = create_attention_mask(original_lengths, max_tokens, self.device)
|
| 62 |
+
attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
|
| 63 |
+
|
| 64 |
+
output = LLaDAModelLM.forward(
|
| 65 |
+
self,
|
| 66 |
+
input_ids=input_ids,
|
| 67 |
+
attention_bias=attention_bias,
|
| 68 |
+
use_cache=use_cache
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
if infer:
|
| 72 |
+
return output
|
| 73 |
+
|
| 74 |
+
if labels is None:
|
| 75 |
+
if return_dict:
|
| 76 |
+
return {'logits': output.logits}
|
| 77 |
+
else:
|
| 78 |
+
return output.logits
|
| 79 |
+
|
| 80 |
+
labels = [label + [-100] * (max_tokens - len(label)) for label in labels]
|
| 81 |
+
labels = torch.tensor(labels, dtype=torch.int64, device=self.device)
|
| 82 |
+
|
| 83 |
+
logits = output.logits
|
| 84 |
+
batch_size = logits.shape[0]
|
| 85 |
+
|
| 86 |
+
unscaled_loss = F.cross_entropy(
|
| 87 |
+
logits.contiguous().view(-1, logits.shape[-1]),
|
| 88 |
+
labels.contiguous().view(-1),
|
| 89 |
+
ignore_index=-100,
|
| 90 |
+
reduction='none'
|
| 91 |
+
).view(batch_size, -1)
|
| 92 |
+
|
| 93 |
+
valid_mask = (labels != -100)
|
| 94 |
+
|
| 95 |
+
if valid_mask.sum() > 0:
|
| 96 |
+
interleave_loss = unscaled_loss[valid_mask].mean()
|
| 97 |
+
else:
|
| 98 |
+
interleave_loss = torch.tensor(0.0, device=self.device)
|
| 99 |
+
|
| 100 |
+
if compute_separate_losses:
|
| 101 |
+
self._debug_step += 1
|
| 102 |
+
debug_this_step = (self._debug_step <= 3)
|
| 103 |
+
|
| 104 |
+
if debug_this_step:
|
| 105 |
+
print(f"\n{'='*80}")
|
| 106 |
+
print(f"DEBUG Step {self._debug_step}")
|
| 107 |
+
print(f"{'='*80}")
|
| 108 |
+
|
| 109 |
+
text_loss_list = []
|
| 110 |
+
image_loss_list = []
|
| 111 |
+
|
| 112 |
+
for b in range(batch_size):
|
| 113 |
+
answer_start_positions = (input_ids[b] == self.ANSWER_START_TOKEN).nonzero(as_tuple=True)[0]
|
| 114 |
+
|
| 115 |
+
if len(answer_start_positions) == 0:
|
| 116 |
+
continue
|
| 117 |
+
|
| 118 |
+
answer_start = answer_start_positions[0].item()
|
| 119 |
+
|
| 120 |
+
answer_end_in_search = (input_ids[b, answer_start:] == self.ANSWER_END_TOKEN).nonzero(as_tuple=True)[0]
|
| 121 |
+
if len(answer_end_in_search) > 0:
|
| 122 |
+
answer_end = answer_start + answer_end_in_search[0].item()
|
| 123 |
+
else:
|
| 124 |
+
answer_end = original_lengths[b]
|
| 125 |
+
|
| 126 |
+
answer_region_input = input_ids[b, answer_start:answer_end]
|
| 127 |
+
image_start_in_answer = (answer_region_input == self.IMAGE_START_TOKEN).nonzero(as_tuple=True)[0]
|
| 128 |
+
|
| 129 |
+
if len(image_start_in_answer) > 0:
|
| 130 |
+
image_start_pos = answer_start + image_start_in_answer[0].item()
|
| 131 |
+
image_end_search = input_ids[b, image_start_pos:]
|
| 132 |
+
image_end_in_search = (image_end_search == self.IMAGE_END_TOKEN).nonzero(as_tuple=True)[0]
|
| 133 |
+
|
| 134 |
+
if len(image_end_in_search) > 0:
|
| 135 |
+
image_end_pos = image_start_pos + image_end_in_search[0].item()
|
| 136 |
+
|
| 137 |
+
for pos in range(image_start_pos + 1, image_end_pos):
|
| 138 |
+
if input_ids[b, pos] != self.BREAKLINE_TOKEN:
|
| 139 |
+
image_loss_list.append(unscaled_loss[b, pos])
|
| 140 |
+
|
| 141 |
+
for pos in range(image_end_pos + 1, answer_end):
|
| 142 |
+
if labels[b, pos] != -100:
|
| 143 |
+
text_loss_list.append(unscaled_loss[b, pos])
|
| 144 |
+
else:
|
| 145 |
+
for pos in range(answer_start + 1, answer_end):
|
| 146 |
+
if labels[b, pos] != -100:
|
| 147 |
+
text_loss_list.append(unscaled_loss[b, pos])
|
| 148 |
+
|
| 149 |
+
if debug_this_step:
|
| 150 |
+
print(f"Total text_loss_list length: {len(text_loss_list)}")
|
| 151 |
+
print(f"Total image_loss_list length: {len(image_loss_list)}")
|
| 152 |
+
if len(text_loss_list) > 0:
|
| 153 |
+
non_zero_text = [l.item() for l in text_loss_list if l.item() > 0]
|
| 154 |
+
print(f"Non-zero text losses count: {len(non_zero_text)}/{len(text_loss_list)}")
|
| 155 |
+
print(f"Sample non-zero text losses: {non_zero_text[:5]}")
|
| 156 |
+
if len(image_loss_list) > 0:
|
| 157 |
+
non_zero_image = [l.item() for l in image_loss_list if l.item() > 0]
|
| 158 |
+
print(f"Non-zero image losses count: {len(non_zero_image)}/{len(image_loss_list)}")
|
| 159 |
+
print(f"Sample non-zero image losses: {non_zero_image[:5]}")
|
| 160 |
+
print(f"{'='*80}\n")
|
| 161 |
+
|
| 162 |
+
if len(text_loss_list) > 0:
|
| 163 |
+
text_loss = torch.stack(text_loss_list).mean()
|
| 164 |
+
else:
|
| 165 |
+
text_loss = torch.tensor(0.0, device=self.device)
|
| 166 |
+
|
| 167 |
+
if len(image_loss_list) > 0:
|
| 168 |
+
image_loss = torch.stack(image_loss_list).mean()
|
| 169 |
+
else:
|
| 170 |
+
image_loss = torch.tensor(0.0, device=self.device)
|
| 171 |
+
|
| 172 |
+
if t is not None and len(text_loss_list) > 0:
|
| 173 |
+
text_loss = text_loss / t.mean().clamp(min=0.01)
|
| 174 |
+
|
| 175 |
+
if return_dict:
|
| 176 |
+
return {
|
| 177 |
+
'logits': logits,
|
| 178 |
+
'loss': interleave_loss,
|
| 179 |
+
'interleave_loss': interleave_loss,
|
| 180 |
+
'text_loss': text_loss,
|
| 181 |
+
'image_loss': image_loss,
|
| 182 |
+
'labels': labels,
|
| 183 |
+
}
|
| 184 |
+
else:
|
| 185 |
+
return interleave_loss, {
|
| 186 |
+
'text_loss': text_loss,
|
| 187 |
+
'image_loss': image_loss,
|
| 188 |
+
'interleave_loss': interleave_loss,
|
| 189 |
+
}
|
| 190 |
+
else:
|
| 191 |
+
if return_dict:
|
| 192 |
+
return {'logits': logits, 'loss': interleave_loss, 'labels': labels}
|
| 193 |
+
else:
|
| 194 |
+
return interleave_loss
|
| 195 |
+
|
| 196 |
+
def get_fsdp_wrap_module_list(self) -> List:
|
| 197 |
+
modules = [*list(self.model.transformer.blocks), self.model.transformer.ff_out]
|
| 198 |
+
return modules
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def get_checkpointing_wrap_module_list(self) -> List:
|
| 202 |
+
return list(self.model.transformer.blocks)
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Utility modules
|
| 4 |
+
"""
|
utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (223 Bytes).
|
|
|
utils/__pycache__/generation_utils.cpython-311.pyc
ADDED
|
Binary file (5.68 kB).
|
|
|
utils/__pycache__/image_utils.cpython-311.pyc
ADDED
|
Binary file (15.4 kB).
|
|
|
utils/__pycache__/prompt_utils.cpython-311.pyc
ADDED
|
Binary file (7.9 kB).
|
|
|
utils/generation_utils.py
ADDED
|
@@ -0,0 +1,89 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Generation related utility functions
|
| 4 |
+
"""
|
| 5 |
+
import math
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
import numpy as np
|
| 9 |
+
from typing import Callable, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def add_gumbel_noise(logits, temperature):
|
| 13 |
+
"""
|
| 14 |
+
Gumbel noise addition function
|
| 15 |
+
According to arXiv:2409.02908, for MDM, low-precision Gumbel-Max improves the perplexity score but reduces generation quality.
|
| 16 |
+
Therefore, float64 is used here.
|
| 17 |
+
"""
|
| 18 |
+
if temperature == 0:
|
| 19 |
+
return logits
|
| 20 |
+
logits = logits.to(torch.float64)
|
| 21 |
+
noise = torch.rand_like(logits, dtype=torch.float64)
|
| 22 |
+
gumbel_noise = (- torch.log(noise)) ** temperature
|
| 23 |
+
return logits.exp() / gumbel_noise
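# Minimal usage sketch: during decoding, tokens are typically drawn by taking the argmax
# of the noised logits; temperature == 0 reduces to plain greedy decoding.
def _example_gumbel_sample(logits: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
    noisy_logits = add_gumbel_noise(logits, temperature)  # float64 for numerical stability
    return noisy_logits.argmax(dim=-1)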
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def cosine_schedule(t: torch.Tensor) -> torch.Tensor:
|
| 27 |
+
"""Cosine schedule function: m(t) = cos(π/2 · t) – MaskGit paper Eq.(3)"""
|
| 28 |
+
return torch.cos(0.5 * math.pi * t)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def gumbel_noise(t: torch.Tensor, *, generator: Optional[torch.Generator] = None) -> torch.Tensor:
|
| 32 |
+
"""Return i.i.d. Gumbel(0,1) noise with same shape as t"""
|
| 33 |
+
if generator is None:
|
| 34 |
+
u = torch.rand_like(t)
|
| 35 |
+
else:
|
| 36 |
+
u = torch.rand(t.shape, device=t.device, dtype=t.dtype, generator=generator)
|
| 37 |
+
return -torch.log(-torch.log(u + 1e-20) + 1e-20)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def gumbel_max_sample(logits: torch.Tensor, tau: float = 1.0, *, generator: Optional[torch.Generator] = None) -> torch.Tensor:
|
| 41 |
+
"""Sample from categorical(logits) via Gumbel-Max. τ=0 → greedy argmax"""
|
| 42 |
+
if tau == 0.0:
|
| 43 |
+
return logits.argmax(dim=-1)
|
| 44 |
+
g = gumbel_noise(logits, generator=generator)
|
| 45 |
+
return (logits / tau + g).argmax(dim=-1)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def mask_by_random_topk(
|
| 49 |
+
mask_len: torch.Tensor, # (B,) number of tokens to keep masked
|
| 50 |
+
probs: torch.Tensor, # (B, L) sampled token probability
|
| 51 |
+
*,
|
| 52 |
+
temperature: float = 1.0,
|
| 53 |
+
generator: Optional[torch.Generator] = None,
|
| 54 |
+
) -> torch.BoolTensor:
|
| 55 |
+
"""Return Boolean mask – True means *stay masked* for next step"""
|
| 56 |
+
g = gumbel_noise(probs, generator=generator)
|
| 57 |
+
confidence = torch.log(probs.clamp_min(1e-20)) + temperature * g # higher = more confident
|
| 58 |
+
sorted_conf = torch.sort(confidence, dim=-1).values # ascending
|
| 59 |
+
k = mask_len.long().unsqueeze(1).clamp_(0, probs.size(1) - 1)
|
| 60 |
+
cut_off = torch.gather(sorted_conf, 1, k) # (B,1)
|
| 61 |
+
return confidence < cut_off # (B,L)
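# Minimal usage sketch: keep the `num_to_keep_masked` least-confident positions masked for
# the next refinement step, in the spirit of MaskGit-style parallel decoding.
def _example_topk_masking(probs: torch.Tensor, num_to_keep_masked: int) -> torch.BoolTensor:
    mask_len = torch.full((probs.shape[0],), num_to_keep_masked, device=probs.device)
    return mask_by_random_topk(mask_len, probs, temperature=1.0)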
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def get_num_transfer_tokens(mask_index, steps):
|
| 65 |
+
"""
|
| 66 |
+
In the reverse process, the interval [0, 1] is uniformly discretized into `steps` intervals.
|
| 67 |
+
Since LLaDA employs a linear noise schedule (as defined in Eq.(8)),
|
| 68 |
+
the expected number of tokens transitioned at each step should be consistent
|
| 69 |
+
|
| 70 |
+
This function is designed to precompute the number of tokens that need to be transitioned at each step
|
| 71 |
+
"""
|
| 72 |
+
mask_num = mask_index.sum(dim=1, keepdim=True)
|
| 73 |
+
|
| 74 |
+
base = mask_num // steps
|
| 75 |
+
remainder = mask_num % steps
|
| 76 |
+
|
| 77 |
+
num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
|
| 78 |
+
|
| 79 |
+
for i in range(mask_num.size(0)):
|
| 80 |
+
num_transfer_tokens[i, :remainder[i]] += 1
|
| 81 |
+
|
| 82 |
+
return num_transfer_tokens
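# Worked example: with 7 masked positions and steps=3, the remainder is spread over the
# first steps, giving [3, 2, 2] tokens to unmask per step.
def _example_transfer_schedule():
    mask_index = torch.tensor([[True] * 7 + [False]])
    return get_num_transfer_tokens(mask_index, steps=3)  # tensor([[3, 2, 2]])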
|
| 83 |
+
|
| 84 |
+
def setup_seed(seed: int):
|
| 85 |
+
"""Set random seed"""
|
| 86 |
+
import random
|
| 87 |
+
torch.manual_seed(seed)
|
| 88 |
+
torch.cuda.manual_seed_all(seed)
|
| 89 |
+
random.seed(seed)
|
utils/image_utils.py
ADDED
|
@@ -0,0 +1,285 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Image processing utilities
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
import PIL
|
| 7 |
+
import random
|
| 8 |
+
from PIL import Image, ImageDraw
|
| 9 |
+
from diffusers import VQModel
|
| 10 |
+
from diffusers.image_processor import VaeImageProcessor
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
|
| 13 |
+
def decode_vq_to_image(
|
| 14 |
+
vq_codes: torch.LongTensor,
|
| 15 |
+
save_path: str = None,
|
| 16 |
+
vae_ckpt: str = None,
|
| 17 |
+
image_height: int = 512,
|
| 18 |
+
image_width: int = 512,
|
| 19 |
+
vqvae: VQModel = None
|
| 20 |
+
) -> Image.Image:
|
| 21 |
+
"""
|
| 22 |
+
Decode VQ codes to image
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
vq_codes: VQ codes in range [0, codebook_size), shape [batch_size, seq_len]
|
| 26 |
+
save_path: Save path (optional, if None will not save to file)
|
| 27 |
+
vae_ckpt: VAE checkpoint path (optional if vqvae is provided)
|
| 28 |
+
image_height: Image height
|
| 29 |
+
image_width: Image width
|
| 30 |
+
vqvae: VQ-VAE model, if None will load from vae_ckpt
|
| 31 |
+
|
| 32 |
+
Returns:
|
| 33 |
+
PIL image
|
| 34 |
+
"""
|
| 35 |
+
device = vq_codes.device
|
| 36 |
+
if vqvae is None:
|
| 37 |
+
vqvae = VQModel.from_pretrained(vae_ckpt, subfolder="vqvae").to(device)
|
| 38 |
+
|
| 39 |
+
scale = 2 ** (len(vqvae.config.block_out_channels) - 1)
|
| 40 |
+
img_proc = VaeImageProcessor(vae_scale_factor=scale, do_normalize=False)
|
| 41 |
+
|
| 42 |
+
# Calculate latent space grid size
|
| 43 |
+
latent_height = image_height // scale
|
| 44 |
+
latent_width = image_width // scale
|
| 45 |
+
|
| 46 |
+
# Ensure VQ codes length matches
|
| 47 |
+
expected_len = latent_height * latent_width
|
| 48 |
+
if vq_codes.shape[1] != expected_len:
|
| 49 |
+
raise ValueError(
|
| 50 |
+
f"VQ codes length mismatch: {vq_codes.shape[1]} != {expected_len} "
|
| 51 |
+
f"for image size ({image_height},{image_width}) with scale {scale}"
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
# Reshape to 2D grid: [batch_size, seq_len] -> [batch_size, latent_height, latent_width]
|
| 55 |
+
# vq_codes should already be in range [0, codebook_size), no offset needed
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
latents = vq_codes.view(vq_codes.shape[0], latent_height, latent_width).long()
|
| 59 |
+
# latents = (vq_codes.view(1, latent_height, latent_width) - 126356).long()
|
| 60 |
+
|
| 61 |
+
# Decode
|
| 62 |
+
recon = vqvae.decode(
|
| 63 |
+
latents,
|
| 64 |
+
force_not_quantize=True,
|
| 65 |
+
shape=(vq_codes.shape[0], latent_height, latent_width, vqvae.config.latent_channels),
|
| 66 |
+
).sample.clip(0, 1)
|
| 67 |
+
|
| 68 |
+
# Post-process
|
| 69 |
+
img = img_proc.postprocess(recon.detach(), output_type="pil")[0]
|
| 70 |
+
|
| 71 |
+
# Save image (only if save_path is provided)
|
| 72 |
+
if save_path is not None:
|
| 73 |
+
img.save(save_path)
|
| 74 |
+
|
| 75 |
+
return img
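# Illustrative usage sketch: decoding 1024 codes (a 32x32 latent grid) into a 512x512 image.
# The VQ-VAE checkpoint path is a placeholder, not a real location.
def _example_decode_codes(vq_codes: torch.LongTensor) -> Image.Image:
    vqvae = VQModel.from_pretrained("path/to/vqvae-checkpoint", subfolder="vqvae").to(vq_codes.device)
    return decode_vq_to_image(vq_codes, image_height=512, image_width=512, vqvae=vqvae)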
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def preprocess_image(image_path: str, target_size: tuple = (512, 512)):
|
| 79 |
+
"""
|
| 80 |
+
Preprocess image: load, crop, resize
|
| 81 |
+
|
| 82 |
+
Args:
|
| 83 |
+
image_path: Image path
|
| 84 |
+
target_size: Target size (width, height)
|
| 85 |
+
|
| 86 |
+
Returns:
|
| 87 |
+
Processed PIL image
|
| 88 |
+
"""
|
| 89 |
+
img = Image.open(image_path).convert("RGB")
|
| 90 |
+
crop_size_list = generate_crop_size_list((target_size[0] // 32) ** 2, 32)
|
| 91 |
+
processed_img = var_center_crop(img, crop_size_list=crop_size_list)
|
| 92 |
+
return processed_img
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def calculate_vq_params(image_height: int, image_width: int, vae_scale: int = 16):
|
| 96 |
+
"""
|
| 97 |
+
Calculate VQ related parameters
|
| 98 |
+
|
| 99 |
+
Args:
|
| 100 |
+
image_height: Image height
|
| 101 |
+
image_width: Image width
|
| 102 |
+
vae_scale: VAE scale factor
|
| 103 |
+
|
| 104 |
+
Returns:
|
| 105 |
+
seq_len, newline_every, token_grid_height, token_grid_width
|
| 106 |
+
"""
|
| 107 |
+
token_grid_height = image_height // vae_scale
|
| 108 |
+
token_grid_width = image_width // vae_scale
|
| 109 |
+
seq_len = token_grid_height * token_grid_width
|
| 110 |
+
newline_every = token_grid_width
|
| 111 |
+
return seq_len, newline_every, token_grid_height, token_grid_width
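# Worked example: a 512x512 image with a VAE scale factor of 16 maps to a 32x32 token grid,
# i.e. 1024 VQ tokens with a break-line token after every 32 tokens.
def _example_vq_params():
    seq_len, newline_every, grid_h, grid_w = calculate_vq_params(512, 512, vae_scale=16)
    assert (seq_len, newline_every, grid_h, grid_w) == (1024, 32, 32, 32)
    return seq_len, newline_every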
|
| 112 |
+
|
| 113 |
+
def center_crop(pil_image, crop_size):
|
| 114 |
+
while pil_image.size[0] >= 2 * crop_size[0] and pil_image.size[1] >= 2 * crop_size[1]:
|
| 115 |
+
pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=Image.BOX)
|
| 116 |
+
|
| 117 |
+
scale = max(crop_size[0] / pil_image.size[0], crop_size[1] / pil_image.size[1])
|
| 118 |
+
pil_image = pil_image.resize(tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC)
|
| 119 |
+
|
| 120 |
+
crop_left = random.randint(0, pil_image.size[0] - crop_size[0])
|
| 121 |
+
crop_upper = random.randint(0, pil_image.size[1] - crop_size[1])
|
| 122 |
+
crop_right = crop_left + crop_size[0]
|
| 123 |
+
crop_lower = crop_upper + crop_size[1]
|
| 124 |
+
return pil_image.crop(box=(crop_left, crop_upper, crop_right, crop_lower))
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def var_center_crop(pil_image, crop_size_list, random_top_k=1):
|
| 128 |
+
w, h = pil_image.size
|
| 129 |
+
rem_percent = [min(cw / w, ch / h) / max(cw / w, ch / h) for cw, ch in crop_size_list]
|
| 130 |
+
crop_size = random.choice(
|
| 131 |
+
sorted(((x, y) for x, y in zip(rem_percent, crop_size_list)), reverse=True)[:random_top_k]
|
| 132 |
+
)[1]
|
| 133 |
+
return center_crop(pil_image, crop_size)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def generate_crop_size_list(num_patches, patch_size, max_ratio=4.0):
|
| 137 |
+
assert max_ratio >= 1.0
|
| 138 |
+
crop_size_list = []
|
| 139 |
+
wp, hp = num_patches, 1
|
| 140 |
+
while wp > 0:
|
| 141 |
+
if max(wp, hp) / min(wp, hp) <= max_ratio:
|
| 142 |
+
crop_size_list.append((wp * patch_size, hp * patch_size))
|
| 143 |
+
if (hp + 1) * wp <= num_patches:
|
| 144 |
+
hp += 1
|
| 145 |
+
else:
|
| 146 |
+
wp -= 1
|
| 147 |
+
return crop_size_list
|
| 148 |
+
|
| 149 |
+
def add_break_line(sequence: list, H: int, W: int, new_number: int = 0) -> list:
|
| 150 |
+
"""Add newline characters to sequence"""
|
| 151 |
+
result = []
|
| 152 |
+
for i in range(H):
|
| 153 |
+
start = i * W
|
| 154 |
+
end = start + W
|
| 155 |
+
row = sequence[start:end]
|
| 156 |
+
result.extend(row + [new_number])
|
| 157 |
+
return result
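# Minimal example: a 2x3 grid flattened row-major gets the break token appended after each row.
def _example_add_break_line():
    tokens = add_break_line([1, 2, 3, 4, 5, 6], H=2, W=3, new_number=126084)
    assert tokens == [1, 2, 3, 126084, 4, 5, 6, 126084]
    return tokens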
|
| 158 |
+
|
| 159 |
+
def encode_img_with_breaks(img, vqvae, vae_scale_factor: int = 16):
|
| 160 |
+
"""Encode image and add newline characters"""
|
| 161 |
+
from diffusers.image_processor import VaeImageProcessor
|
| 162 |
+
|
| 163 |
+
orig = img.convert("RGB")
|
| 164 |
+
orig_resized = orig
|
| 165 |
+
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_normalize=False)
|
| 166 |
+
x = image_processor.preprocess(orig_resized).to(vqvae.device)
|
| 167 |
+
latents = vqvae.encode(x).latents
|
| 168 |
+
latents_bsz, channels, lat_h, lat_w = latents.shape
|
| 169 |
+
quantized = vqvae.quantize(latents)[2][2] + 126356
|
| 170 |
+
quantized = quantized.reshape(latents_bsz, lat_h, lat_w).flatten().tolist()
|
| 171 |
+
img_token = add_break_line(quantized, lat_h, lat_w, new_number=126084)
|
| 172 |
+
img_token = [126349] + img_token + [126350]
|
| 173 |
+
return img_token
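# Rough inverse sketch: to recover raw VQ indices from a sequence produced above, drop the
# <boi>/<eoi> markers (126349/126350) and break-line tokens (126084), then remove the
# +126356 offset before decoding.
def _strip_image_tokens(img_token: list) -> list:
    body = img_token[1:-1]  # drop <boi> and <eoi>
    return [t - 126356 for t in body if t != 126084]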
|
| 174 |
+
|
| 175 |
+
@torch.no_grad()
|
| 176 |
+
def encode_img_with_paint(
|
| 177 |
+
img: Image.Image,
|
| 178 |
+
vqvae: VQModel,
|
| 179 |
+
*,
|
| 180 |
+
mask_h_ratio: float = 1, # Height ratio
|
| 181 |
+
mask_w_ratio: float = 0.2, # Width ratio
|
| 182 |
+
gray_value: int = 127, # Visualization gray value
|
| 183 |
+
downsample_mode: str = "area",# Pixel mask alignment to latent grid
|
| 184 |
+
dilate_latent_k: int = 0, # Optional dilation on latent grid (grid count)
|
| 185 |
+
mask_mode: str = "inpainting", # "inpainting" | "outpainting"
|
| 186 |
+
):
|
| 187 |
+
"""
|
| 188 |
+
Encode image with mask for inpainting/outpainting tasks
|
| 189 |
+
|
| 190 |
+
Args:
|
| 191 |
+
img: Input PIL image
|
| 192 |
+
vqvae: VQ-VAE model for encoding
|
| 193 |
+
mask_h_ratio: Height ratio for mask region (default: 1.0)
|
| 194 |
+
mask_w_ratio: Width ratio for mask region (default: 0.2)
|
| 195 |
+
gray_value: Gray value for mask visualization (default: 127)
|
| 196 |
+
downsample_mode: Downsampling mode for mask alignment ("area", "nearest", "bilinear")
|
| 197 |
+
dilate_latent_k: Dilation kernel size for latent grid (default: 0)
|
| 198 |
+
mask_mode: Mask mode - "inpainting" (mask inside) or "outpainting" (mask outside)
|
| 199 |
+
|
| 200 |
+
Returns:
|
| 201 |
+
img_token: List[int] - Token sequence with newlines (126084) inserted at row ends;
|
| 202 |
+
masked positions = 126336, others = index + 126356
|
| 203 |
+
vis_img: PIL.Image - Gray mask visualization image (consistent with mask_mode)
|
| 204 |
+
|
| 205 |
+
Note:
|
| 206 |
+
* Encoding uses original image strictly; mask only maps to latent grid to determine
|
| 207 |
+
which tokens are set to MASK_TOKEN_ID.
|
| 208 |
+
* mask_mode="inpainting": mask inside rectangle; "outpainting": mask outside rectangle (inverse).
|
| 209 |
+
"""
|
| 210 |
+
MASK_TOKEN_ID = 126336 # mask token
|
| 211 |
+
NEWLINE_TOKEN_ID = 126084 # newline token
|
| 212 |
+
VQ_OFFSET = 126356 # quantization index offset
|
| 213 |
+
|
| 214 |
+
assert mask_mode in ("inpainting", "outpainting"), "mask_mode must be 'inpainting' or 'outpainting'"
|
| 215 |
+
|
| 216 |
+
# --- 1) Calculate center rectangle and generate visualization ---
|
| 217 |
+
img = img.convert("RGB")
|
| 218 |
+
W, H = img.size
|
| 219 |
+
mh = int(round(H * mask_h_ratio))
|
| 220 |
+
mw = int(round(W * mask_w_ratio))
|
| 221 |
+
top = (H - mh) // 2
|
| 222 |
+
left = (W - mw) // 2
|
| 223 |
+
bottom = top + mh
|
| 224 |
+
right = left + mw
|
| 225 |
+
|
| 226 |
+
if mask_mode == "inpainting":
|
| 227 |
+
vis_img = img.copy()
|
| 228 |
+
draw = ImageDraw.Draw(vis_img)
|
| 229 |
+
draw.rectangle([left, top, right, bottom], fill=(gray_value, gray_value, gray_value))
|
| 230 |
+
elif mask_mode == "outpainting": # outpainting
|
| 231 |
+
bg = Image.new("RGB", (W, H), (gray_value, gray_value, gray_value))
|
| 232 |
+
crop = img.crop((left, top, right, bottom))
|
| 233 |
+
bg.paste(crop, (left, top))
|
| 234 |
+
vis_img = bg
|
| 235 |
+
|
| 236 |
+
# --- 2) VQ encoding using original image ---
|
| 237 |
+
vae_scale_factor = 2 ** (len(vqvae.config.block_out_channels) - 1)
|
| 238 |
+
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_normalize=False)
|
| 239 |
+
x = image_processor.preprocess(img).to(vqvae.device) # 1 x 3 x H' x W'
|
| 240 |
+
latents = vqvae.encode(x).latents # 1 x C x h x w
|
| 241 |
+
_, _, lat_h, lat_w = latents.shape
|
| 242 |
+
|
| 243 |
+
# Quantization indices
|
| 244 |
+
quant_pack = vqvae.quantize(latents)
|
| 245 |
+
indices = quant_pack[2][2].view(1, lat_h, lat_w) # 1 x h x w, long
|
| 246 |
+
|
| 247 |
+
# --- 3) Pixel mask -> latent grid mask (aligned with encoding input size) ---
|
| 248 |
+
Hp, Wp = x.shape[-2:]
|
| 249 |
+
mask_px = torch.zeros((1, 1, Hp, Wp), dtype=torch.float32, device=vqvae.device)
|
| 250 |
+
# First generate mask where "rectangle inside=1, outside=0"
|
| 251 |
+
top_p = int(round(top * Hp / H))
|
| 252 |
+
left_p = int(round(left * Wp / W))
|
| 253 |
+
bh_p = int(round(mh * Hp / H))
|
| 254 |
+
bw_p = int(round(mw * Wp / W))
|
| 255 |
+
mask_px[:, :, top_p:top_p+bh_p, left_p:left_p+bw_p] = 1.0
|
| 256 |
+
|
| 257 |
+
# For outpainting, invert the mask so that the region outside the rectangle is the masked region
|
| 258 |
+
if mask_mode == "outpainting":
|
| 259 |
+
mask_px = 1.0 - mask_px
|
| 260 |
+
|
| 261 |
+
if downsample_mode not in ("nearest", "area", "bilinear"):
|
| 262 |
+
downsample_mode = "area"
|
| 263 |
+
mask_lat = F.interpolate(mask_px, size=(lat_h, lat_w), mode=downsample_mode)
|
| 264 |
+
mask_lat = (mask_lat > 0.5) if downsample_mode == "area" else (mask_lat >= 0.5)
|
| 265 |
+
mask_lat = mask_lat[0, 0] # h x w (bool)
|
| 266 |
+
|
| 267 |
+
# Optional: latent grid dilation (after inversion is applied)
|
| 268 |
+
if dilate_latent_k > 0:
|
| 269 |
+
m = mask_lat.float().unsqueeze(0).unsqueeze(0)
|
| 270 |
+
ker = 2 * dilate_latent_k + 1
|
| 271 |
+
m = F.max_pool2d(m, kernel_size=ker, stride=1, padding=dilate_latent_k)
|
| 272 |
+
mask_lat = (m[0, 0] > 0.5)
|
| 273 |
+
|
| 274 |
+
# --- 4) Generate tokens: masked positions=MASK_TOKEN_ID, others=indices+VQ_OFFSET ---
|
| 275 |
+
idx_flat = indices.view(-1)
|
| 276 |
+
mask_flat = mask_lat.view(-1)
|
| 277 |
+
tokens = torch.empty_like(idx_flat)
|
| 278 |
+
tokens[mask_flat] = MASK_TOKEN_ID
|
| 279 |
+
tokens[~mask_flat] = idx_flat[~mask_flat] + VQ_OFFSET
|
| 280 |
+
tokens_list = tokens.tolist()
|
| 281 |
+
|
| 282 |
+
# --- 5) Insert newlines (no longer wrapped in <boi>/<eoi>, consistent with current return) ---
|
| 283 |
+
|
| 284 |
+
img_token = add_break_line(tokens_list, lat_h, lat_w, NEWLINE_TOKEN_ID)
|
| 285 |
+
return img_token, vis_img
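# Illustrative usage sketch: building an inpainting token sequence where the central strip
# covering 20% of the width is replaced by mask tokens. The image path is a placeholder.
def _example_inpainting_tokens(vqvae: VQModel):
    img = Image.open("path/to/input.png")
    return encode_img_with_paint(
        img, vqvae, mask_h_ratio=1.0, mask_w_ratio=0.2, mask_mode="inpainting"
    )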
|
utils/prompt_utils.py
ADDED
|
@@ -0,0 +1,233 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Prompt generation utilities for different inference types
|
| 4 |
+
"""
|
| 5 |
+
from typing import Dict, List, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
def create_prompt_templates():
|
| 8 |
+
"""Create prompt templates for various tasks"""
|
| 9 |
+
templates = {
|
| 10 |
+
"text_understanding": "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer.",
|
| 11 |
+
"image_generation": "Generate an image according to the text prompt.",
|
| 12 |
+
"image_editing": "Generate an image applying the following editing instruction based on the original image.",
|
| 13 |
+
"dense_prediction": "Perform dense prediction on the given images.",
|
| 14 |
+
"control_generation": "Generate an image according to the text prompt and the given control image.",
|
| 15 |
+
"subject_generation": "Generate an image according to the text prompt and the given object image.",
|
| 16 |
+
"multi_view": "Generate a view-image based on the given image.",
|
| 17 |
+
"style_transfer": "Transform the current image into the style of the provided image."
|
| 18 |
+
}
|
| 19 |
+
return templates
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def generate_text_to_image_prompt(prompt_text: str, templates: Optional[Dict] = None) -> Tuple[str, str]:
|
| 23 |
+
"""
|
| 24 |
+
Generate prompt for text-to-image generation
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
prompt_text: User input text prompt
|
| 28 |
+
templates: Optional prompt templates dict
|
| 29 |
+
|
| 30 |
+
Returns:
|
| 31 |
+
Tuple of (input_prompt, unconditional_prompt)
|
| 32 |
+
"""
|
| 33 |
+
if templates is None:
|
| 34 |
+
templates = create_prompt_templates()
|
| 35 |
+
|
| 36 |
+
system_prompt = templates["image_generation"]
|
| 37 |
+
input_prompt = "<system>" + system_prompt + "</system>" + "<user>" + prompt_text + "</user>"
|
| 38 |
+
uncon_prompt = "<system>" + system_prompt + "</system>" + "<user>" + "<uncondition>" + "</user>"
|
| 39 |
+
|
| 40 |
+
return input_prompt, uncon_prompt
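# Worked example: the returned strings wrap the system template and user text in
# <system>/<user> tags; the unconditional variant swaps the user text for <uncondition>.
def _example_t2i_prompt():
    cond, uncond = generate_text_to_image_prompt("a red bicycle leaning against a wall")
    # cond   == "<system>Generate an image according to the text prompt.</system>"
    #           "<user>a red bicycle leaning against a wall</user>"
    # uncond == "<system>Generate an image according to the text prompt.</system>"
    #           "<user><uncondition></user>"
    return cond, uncond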
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def generate_image_to_image_prompt(
|
| 44 |
+
prompt_text: str,
|
| 45 |
+
edit_type: str,
|
| 46 |
+
templates: Optional[Dict] = None,
|
| 47 |
+
**kwargs
|
| 48 |
+
) -> Tuple[str, str, str]:
|
| 49 |
+
"""
|
| 50 |
+
Generate prompt for image-to-image generation
|
| 51 |
+
|
| 52 |
+
Args:
|
| 53 |
+
prompt_text: User input text prompt
|
| 54 |
+
edit_type: Type of editing operation
|
| 55 |
+
templates: Optional prompt templates dict
|
| 56 |
+
**kwargs: Additional parameters for specific edit types
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
Tuple of (input_prompt, unconditional_prompt, system_prompt)
|
| 60 |
+
"""
|
| 61 |
+
if templates is None:
|
| 62 |
+
templates = create_prompt_templates()
|
| 63 |
+
|
| 64 |
+
# Determine system prompt and processed prompt text based on edit type
|
| 65 |
+
if 'dense' in edit_type:
|
| 66 |
+
des = {
|
| 67 |
+
"canny": "canny edge map",
|
| 68 |
+
"hed": "hed edge map",
|
| 69 |
+
"normal": "normal map",
|
| 70 |
+
"sam2mask": "sam2 mask",
|
| 71 |
+
"depth": "depth map",
|
| 72 |
+
"openpose": "pose estimation map"
|
| 73 |
+
}
|
| 74 |
+
system_prompt = templates["dense_prediction"]
|
| 75 |
+
prompt_text_used = f"Generate a {des.get(edit_type.split('_')[0], 'dense map')} according to the image."
|
| 76 |
+
|
| 77 |
+
elif 'control' in edit_type:
|
| 78 |
+
system_prompt = templates["control_generation"]
|
| 79 |
+
prompt_text_used = prompt_text
|
| 80 |
+
|
| 81 |
+
elif 'subject' in edit_type:
|
| 82 |
+
system_prompt = templates["subject_generation"]
|
| 83 |
+
prompt_text_used = prompt_text
|
| 84 |
+
|
| 85 |
+
elif 'edit' in edit_type:
|
| 86 |
+
system_prompt = templates["image_editing"]
|
| 87 |
+
prompt_text_used = prompt_text
|
| 88 |
+
|
| 89 |
+
elif "ref_transfer" in edit_type:
|
| 90 |
+
system_prompt = templates["style_transfer"]
|
| 91 |
+
prompt_text_used = "Transform the current image into the style of the provided image."
|
| 92 |
+
|
| 93 |
+
elif 'multi_view' in edit_type:
|
| 94 |
+
system_prompt = templates["multi_view"]
|
| 95 |
+
prompt_text_used = f"Generate the {edit_type.split('_')[-1]} view based on the provided front view."
|
| 96 |
+
|
| 97 |
+
else:
|
| 98 |
+
system_prompt = "Generate an image according to the prompt and image."
|
| 99 |
+
prompt_text_used = prompt_text
|
| 100 |
+
|
| 101 |
+
# Build final prompts
|
| 102 |
+
input_prompt = "<system>" + system_prompt + "</system>" + "<user>" + prompt_text_used + "</user>"
|
| 103 |
+
uncon_prompt = "<system>" + system_prompt + "</system>" + "<user>" + "<uncondition>" + "</user>"
|
| 104 |
+
|
| 105 |
+
return input_prompt, uncon_prompt, system_prompt
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def generate_multimodal_understanding_prompt(question: str, templates: Optional[Dict] = None) -> str:
|
| 109 |
+
"""
|
| 110 |
+
Generate prompt for multimodal understanding (MMU)
|
| 111 |
+
|
| 112 |
+
Args:
|
| 113 |
+
question: User question about the image
|
| 114 |
+
templates: Optional prompt templates dict
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
Formatted input prompt
|
| 118 |
+
"""
|
| 119 |
+
if templates is None:
|
| 120 |
+
templates = create_prompt_templates()
|
| 121 |
+
|
| 122 |
+
system_prompt = "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer."
|
| 123 |
+
input_prompt = "<system>" + system_prompt + "</system>" + "<user>" + question + "</user>"
|
| 124 |
+
|
| 125 |
+
return input_prompt
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def get_edit_type_specific_prompt(edit_type: str, prompt_text: str, templates: Optional[Dict] = None) -> str:
|
| 129 |
+
"""
|
| 130 |
+
Get edit type specific prompt text
|
| 131 |
+
|
| 132 |
+
Args:
|
| 133 |
+
edit_type: Type of editing operation
|
| 134 |
+
prompt_text: Original prompt text
|
| 135 |
+
templates: Optional prompt templates dict
|
| 136 |
+
|
| 137 |
+
Returns:
|
| 138 |
+
Processed prompt text for the specific edit type
|
| 139 |
+
"""
|
| 140 |
+
if templates is None:
|
| 141 |
+
templates = create_prompt_templates()
|
| 142 |
+
|
| 143 |
+
if 'dense' in edit_type:
|
| 144 |
+
des = {
|
| 145 |
+
"canny": "canny edge map",
|
| 146 |
+
"hed": "hed edge map",
|
| 147 |
+
"normal": "normal map",
|
| 148 |
+
"sam2mask": "sam2 mask",
|
| 149 |
+
"depth": "depth map",
|
| 150 |
+
"openpose": "pose estimation map"
|
| 151 |
+
}
|
| 152 |
+
return f"Generate a {des.get(edit_type.split('_')[0], 'dense map')} according to the image."
|
| 153 |
+
|
| 154 |
+
elif 'control' in edit_type:
|
| 155 |
+
return prompt_text
|
| 156 |
+
|
| 157 |
+
elif 'subject' in edit_type:
|
| 158 |
+
return prompt_text
|
| 159 |
+
|
| 160 |
+
elif 'edit' in edit_type:
|
| 161 |
+
if "multiturn" in edit_type:
|
| 162 |
+
ids = int(edit_type.split("_")[-1])
|
| 163 |
+
if ids == 0:
|
| 164 |
+
return prompt_text[0] if isinstance(prompt_text, list) else prompt_text
|
| 165 |
+
else:
|
| 166 |
+
return prompt_text[ids][0] if isinstance(prompt_text[ids], list) else prompt_text[ids]
|
| 167 |
+
else:
|
| 168 |
+
return prompt_text
|
| 169 |
+
|
| 170 |
+
elif "ref_transfer" in edit_type:
|
| 171 |
+
return "Transform the current image into the style of the provided image."
|
| 172 |
+
|
| 173 |
+
elif 'multi_view' in edit_type:
|
| 174 |
+
return f"Generate the {edit_type.split('_')[-1]} view based on the provided front view."
|
| 175 |
+
|
| 176 |
+
else:
|
| 177 |
+
return prompt_text
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def get_system_prompt_for_edit_type(edit_type: str, templates: Optional[Dict] = None) -> str:
|
| 181 |
+
"""
|
| 182 |
+
Get system prompt for specific edit type
|
| 183 |
+
|
| 184 |
+
Args:
|
| 185 |
+
edit_type: Type of editing operation
|
| 186 |
+
templates: Optional prompt templates dict
|
| 187 |
+
|
| 188 |
+
Returns:
|
| 189 |
+
System prompt for the edit type
|
| 190 |
+
"""
|
| 191 |
+
if templates is None:
|
| 192 |
+
templates = create_prompt_templates()
|
| 193 |
+
|
| 194 |
+
if 'dense' in edit_type:
|
| 195 |
+
return templates["dense_prediction"]
|
| 196 |
+
elif 'control' in edit_type:
|
| 197 |
+
return templates["control_generation"]
|
| 198 |
+
elif 'subject' in edit_type:
|
| 199 |
+
return templates["subject_generation"]
|
| 200 |
+
elif 'edit' in edit_type:
|
| 201 |
+
return templates["image_editing"]
|
| 202 |
+
elif "ref_transfer" in edit_type:
|
| 203 |
+
return templates["style_transfer"]
|
| 204 |
+
elif 'multi_view' in edit_type:
|
| 205 |
+
return templates["multi_view"]
|
| 206 |
+
else:
|
| 207 |
+
return "Generate an image according to the prompt and image."
|
| 208 |
+
|
| 209 |
+
def generate_text_image_to_text_image_prompt(prompt_text, system_prompt):
|
| 210 |
+
"""
|
| 211 |
+
Generate prompts for TI2TI tasks
|
| 212 |
+
|
| 213 |
+
Args:
|
| 214 |
+
prompt_text: User's editing instruction
|
| 215 |
+
system_prompt: System prompt for the task
|
| 216 |
+
|
| 217 |
+
Returns:
|
| 218 |
+
input_prompt: Conditional prompt
|
| 219 |
+
uncon_text: Unconditional prompt
|
| 220 |
+
"""
|
| 221 |
+
# Conditional prompt
|
| 222 |
+
input_prompt = (
|
| 223 |
+
f"<system>{system_prompt}</system>"
|
| 224 |
+
f"<user>{prompt_text}</user>"
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
# Unconditional prompt (for CFG)
|
| 228 |
+
uncon_text = (
|
| 229 |
+
f"<system>{system_prompt}</system>"
|
| 230 |
+
f"<user><uncondition></user>"
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
return input_prompt, uncon_text
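# Minimal usage sketch: pairing an editing instruction with its unconditional counterpart
# for classifier-free guidance in a TI2TI request.
def _example_ti2ti_prompt():
    system_prompt = create_prompt_templates()["image_editing"]
    return generate_text_image_to_text_image_prompt(
        "Replace the sky with a starry night.", system_prompt
    )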
|