YucYux committed
Commit · 5954d37
1 Parent(s): 0e83169
fixed model loading bug
app.py CHANGED
@@ -83,7 +83,7 @@ def _load_model_and_tokenizer_core(model_path_to_load, model_display_name_for_st
     TOKENIZER = AutoTokenizer.from_pretrained(model_path_to_load, trust_remote_code=True)
     status_msg_parts.append(f"Tokenizer for '{model_display_name_for_status}' loaded.")

-    MODEL = MMadaModelLM.from_pretrained(model_path_to_load, trust_remote_code=True, torch_dtype=torch.bfloat16).
+    MODEL = MMadaModelLM.from_pretrained(model_path_to_load, trust_remote_code=True, torch_dtype=torch.bfloat16).eval()
     status_msg_parts.append(f"Model '{model_display_name_for_status}' loaded to {DEVICE}.")

     uni_prompting = UniversalPrompting(TOKENIZER, max_text_len=512, special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"), ignore_id=-100, cond_dropout_prob=0.1, use_reserved_token=True)
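The one-line fix above chains .eval() onto from_pretrained, which the old line left dangling after the trailing period. As a minimal sketch of the same loading pattern in plain transformers (assuming a generic AutoModelForCausalLM in place of the repo's custom MMadaModelLM class):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(path: str):
    # trust_remote_code is required for repos that ship custom modeling code,
    # as MMaDA does; eval() disables dropout for deterministic inference.
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        path, trust_remote_code=True, torch_dtype=torch.bfloat16
    ).eval()
    return model, tokenizer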
@@ -264,35 +264,49 @@ def generate_viz_wrapper_t2i(prompt_text, steps, guidance_scale, mask_schedule="
     if MODEL is None or TOKENIZER is None or MASK_ID is None:
         yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
         return

-    steps = int(steps)
-    guidance_scale = float(guidance_scale)

+    if DEVICE == 'cuda':
+        print("Moving MODEL to GPU for inference...")
+        MODEL.to(DEVICE)
+        VQ_MODEL.to(DEVICE)
+
+    try:
+        steps = int(steps)
+        guidance_scale = float(guidance_scale)
+
+        image_tokens = torch.ones((1, 1024), dtype=torch.long, device=DEVICE) * MASK_ID
+        prompt_text = [prompt_text]
+        input_ids, attention_mask = uni_prompting((prompt_text, image_tokens), 't2i_gen')
+
+        if guidance_scale > 0:
+            uncond_input_ids, uncond_attention_mask = uni_prompting(([''], image_tokens), 't2i_gen')
+        else:
+            uncond_input_ids, uncond_attention_mask = None, None
+
+        mask_schedule = get_mask_schedule(mask_schedule)
+        blank_image = Image.new("RGB", (512, 512), (255, 255, 255))
+        yield blank_image, "Starting generation..."
+        for image_step, status_msg_step in MODEL.t2i_generate_decoding_stepwise(
+                input_ids = input_ids,
+                uncond_input_ids = uncond_input_ids,
+                attention_mask = attention_mask,
+                uncond_attention_mask = uncond_attention_mask,
+                temperature=1.0,
+                timesteps = steps,
+                guidance_scale = guidance_scale,
+                noise_schedule = mask_schedule,
+                noise_type = "mask",
+                seq_len = 1024,
+                vq_model = VQ_MODEL,
+                uni_prompting=uni_prompting):
+            yield image_step, status_msg_step
+
+    finally:
+        if DEVICE == 'cuda':
+            print("Moving MODEL back to CPU...")
+            MODEL.to('cpu')
+            VQ_MODEL.to('cpu')
+            torch.cuda.empty_cache()
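The hunk's main structural change is the move-to-GPU / try / move-back-to-CPU wrapper around the whole generation body, which keeps the weights off the GPU between requests (useful on shared or ZeroGPU Spaces). A minimal sketch of that pattern in isolation; run_on_gpu and fn are illustrative names, not functions from app.py:

import torch

def run_on_gpu(model, fn, *args, **kwargs):
    # Shuttle a CPU-resident model onto the GPU for one call, then always
    # move it back and release cached VRAM, even if fn raises.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model.to(device)
        return fn(*args, **kwargs)
    finally:
        if device == 'cuda':
            model.to('cpu')
            torch.cuda.empty_cache()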
@@ -306,149 +320,160 @@ def generate_viz_wrapper_lm(prompt_text, steps, gen_length, block_length, temper
         yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
         return

-    if thinking_mode_lm:
-        prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text

+    if DEVICE == 'cuda':
+        print("Moving MODEL to GPU for inference...")
+        MODEL.to(DEVICE)

     try:
+        steps = int(steps)
+        gen_length = int(gen_length)
+        block_length = int(block_length)
+
+        if thinking_mode_lm:
+            prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
+
+        try:
+            m = [{"role": "user", "content": prompt_text}]
+            processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
+        except Exception as e:
+            yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
+            processed_prompt_text = prompt_text
+        try:
+            if TOKENIZER.pad_token_id is None:
+                if TOKENIZER.eos_token_id is not None:
+                    TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
+                else: # Should have been caught by load_model, but double check
+                    yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
+                    return
+
+            input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
+            raw_prompt_attention_mask = None
+
+        except Exception as e:
+            yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
+            return
+
+        batch_size = input_ids.shape[0]
+        prompt_len = input_ids.shape[1]
+
+        x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
+        x[:, :prompt_len] = input_ids.clone()
+
+        yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
+
+        if gen_length == 0:
+            final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
+            yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
+            return
+
+        if block_length <= 0 or gen_length % block_length != 0 :
+            yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
+                f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
+            return
+        num_blocks = gen_length // block_length
+
+        if steps <=0 or steps % num_blocks != 0:
+            yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
+                f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
+            return
+        steps_per_block = steps // num_blocks
+
+        for num_block_iter in range(num_blocks):
+            current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
+            current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
+
+            block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
+            block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
+                (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
+
+            num_transfer_tokens_for_this_block = get_num_transfer_tokens(
+                block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
+                steps_per_block
+            )
+
+            for i_step_in_block in range(steps_per_block):
+                mask_index_global = (x == MASK_ID)
+
+                if cfg_scale > 0.:
+                    un_x = x.clone()
+                    # For unconditional pass, mask out the original prompt tokens that are not padding
+                    # raw_prompt_attention_mask is (B, prompt_len)
+                    prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
+                    un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
+
+                    x_cfg_input = torch.cat([x, un_x], dim=0)
+                    # Pass attention_mask for CFG if model expects it, covering both parts
+                    # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
+                    model_output = MODEL(x_cfg_input)
+                    logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
+                    logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
+                else:
+                    # Not passing explicit attention_mask here; relies on model's internal handling.
+                    model_output = MODEL(x)
+                    logits = model_output.logits
+
+                logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
+                x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
+
+                if remasking_strategy == 'low_confidence':
+                    probs = F.softmax(logits.to(torch.float64), dim=-1)
+                    x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
+                elif remasking_strategy == 'random':
+                    x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
+                else:
+                    yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
+                    return
+
+                confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
+                candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
+                confidence_for_selection = torch.where(
+                    candidate_positions_for_unmasking,
+                    x0_probs,
+                    -torch.inf
+                )

+                x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)

+                transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
+                num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]

+                for j_batch_idx in range(batch_size):
+                    k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
+                                candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large

+                    if k_val > 0:
+                        # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
+                        conf_slice = confidence_for_selection[j_batch_idx]
+                        if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
+
+                        # Check if there are enough valid (non -inf) confidences
+                        valid_conf_count = (conf_slice > -torch.inf).sum().item()
+                        actual_k = min(k_val, valid_conf_count)

+                        if actual_k > 0:
+                            _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
+                            transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True

+                x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]

+                current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
+                total_overall_steps = num_blocks * steps_per_block
+                status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
+                yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg

+        final_generated_ids = x[:, prompt_len:]
+        final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)

+        final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
+        yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str

+    finally:
+        if DEVICE == 'cuda':
+            print("Moving MODEL back to CPU and clearing cache...")
+            MODEL.to('cpu')
+            torch.cuda.empty_cache()

 @torch.no_grad()
 @spaces.GPU
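The inner loop above is mask-predict style decoding: every masked position gets a prediction at every step, but only the most confident predictions inside the current block are committed, and the rest stay masked for later steps. A condensed, self-contained sketch of one such step; decode_step is my name, not a helper from app.py:

import torch
import torch.nn.functional as F

def decode_step(logits, x, mask_index, num_to_unmask):
    # logits: (B, L, V); x: (B, L) current token ids; mask_index: (B, L) bool.
    x0 = torch.argmax(logits, dim=-1)                    # greedy proposal everywhere
    probs = F.softmax(logits.to(torch.float64), dim=-1)
    conf = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
    conf = torch.where(mask_index, conf, -torch.inf)     # only masked slots compete
    _, top_idx = torch.topk(conf, k=num_to_unmask, dim=-1)
    commit = torch.zeros_like(mask_index).scatter_(1, top_idx, True)
    return torch.where(commit & mask_index, x0, x)       # unmask only the top-k winners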
@@ -460,177 +485,190 @@ def generate_viz_wrapper(uploaded_image_pil, prompt_text, steps, gen_length, blo
         yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
         return

-    try:
-        m = [{"role": "user", "content": prompt_text}]
-        processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
-    except Exception as e:
-        yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
-        processed_prompt_text = prompt_text
-
-    image_vq_ids_tensor = None
-    if uploaded_image_pil is not None:
-        try:
-            image = image.unsqueeze(0)
-            image_vq_ids_tensor = VQ_MODEL.get_code(image) + 126349
-        except Exception as e:
-            yield [("Error

+    if DEVICE == 'cuda':
+        print("Moving MODEL to GPU for inference...")
+        MODEL.to(DEVICE)
+        VQ_MODEL.to(DEVICE)
+
+    try:
+        steps = int(steps)
+        gen_length = int(gen_length)
+        block_length = int(block_length)
+
+        if thinking_mode_mmu:
+            prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
+
+        try:
+            m = [{"role": "user", "content": prompt_text}]
+            processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
+        except Exception as e:
+            yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
+            processed_prompt_text = prompt_text
+
+        image_vq_ids_tensor = None
+        if uploaded_image_pil is not None:
+            try:
+                image = image_transform(uploaded_image_pil, resolution=512).to(DEVICE)
+                image = image.unsqueeze(0)
+                image_vq_ids_tensor = VQ_MODEL.get_code(image) + 126349
+            except Exception as e:
+                yield [("Error processing image.", "ERROR")], f"Image to VQ tokens conversion failed: {str(e)}"
+                return
+
+        try:
+            if TOKENIZER.pad_token_id is None:
+                if TOKENIZER.eos_token_id is not None:
+                    TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
+                else:
+                    yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
+                    return
+
+            input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
+            raw_prompt_attention_mask = None
+            if image_vq_ids_tensor is not None:
+                if image_vq_ids_tensor.ndim == 1:
+                    image_vq_ids_tensor = image_vq_ids_tensor.unsqueeze(0)
+
+                input_ids = torch.cat([
+                    (torch.ones(input_ids.shape[0], 1) * torch.tensor([126089])).to(DEVICE),
+                    (torch.ones(input_ids.shape[0], 1) * torch.tensor([126084])).to(DEVICE),
+                    image_vq_ids_tensor,
+                    (torch.ones(input_ids.shape[0], 1) * torch.tensor([126085])).to(DEVICE),
+                    input_ids
+                ], dim=1).long()
+
+            else:
+                input_ids = input_ids
+
+        except Exception as e:
+            yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
+            return
+
+        batch_size = input_ids.shape[0]
+        prompt_len = input_ids.shape[1]
+
+        x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
+        x[:, :prompt_len] = input_ids.clone()
+
+        yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
+
+        if gen_length == 0:
+            final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
+            yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
+            return
+
+        if block_length <= 0 or gen_length % block_length != 0 :
+            yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
+                f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
+            return
+        num_blocks = gen_length // block_length
+
+        if steps <=0 or steps % num_blocks != 0:
+            yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
+                f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
+            return
+        steps_per_block = steps // num_blocks
+
+        for num_block_iter in range(num_blocks):
+            current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
+            current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
+
+            block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
+            block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
+                (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
+
+            num_transfer_tokens_for_this_block = get_num_transfer_tokens(
+                block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
+                steps_per_block
+            )
+
+            for i_step_in_block in range(steps_per_block):
+                mask_index_global = (x == MASK_ID)
+
+                if cfg_scale > 0.:
+                    un_x = x.clone()
+                    # For unconditional pass, mask out the original prompt tokens that are not padding
+                    # raw_prompt_attention_mask is (B, prompt_len)
+                    prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
+                    un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
+
+                    x_cfg_input = torch.cat([x, un_x], dim=0)
+                    # Pass attention_mask for CFG if model expects it, covering both parts
+                    # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
+                    model_output = MODEL(x_cfg_input)
+                    logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
+                    logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
+                else:
+                    # Not passing explicit attention_mask here; relies on model's internal handling.
+                    model_output = MODEL(x)
+                    logits = model_output.logits
+
+                logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
+                x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
+
+                if remasking_strategy == 'low_confidence':
+                    probs = F.softmax(logits.to(torch.float64), dim=-1)
+                    x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
+                elif remasking_strategy == 'random':
+                    x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
+                else:
+                    yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
+                    return
+
+                confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
+                candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
+                confidence_for_selection = torch.where(
+                    candidate_positions_for_unmasking,
+                    x0_probs,
+                    -torch.inf
+                )

+                x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)

+                transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
+                num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]

+                for j_batch_idx in range(batch_size):
+                    k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
+                                candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large

+                    if k_val > 0:
+                        # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
+                        conf_slice = confidence_for_selection[j_batch_idx]
+                        if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
+
+                        # Check if there are enough valid (non -inf) confidences
+                        valid_conf_count = (conf_slice > -torch.inf).sum().item()
+                        actual_k = min(k_val, valid_conf_count)

+                        if actual_k > 0:
+                            _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
+                            transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True

+                x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]

+                current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
+                total_overall_steps = num_blocks * steps_per_block
+                status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
+                yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg

+        final_generated_ids = x[:, prompt_len:]
+        final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)

+        final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
+        yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str

+    finally:
+        if DEVICE == 'cuda':
+            print("Moving MODEL back to CPU and clearing cache...")
+            MODEL.to('cpu')
+            VQ_MODEL.to('cpu')
+            torch.cuda.empty_cache()

 css_styles = """
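When an image is attached, the hunk above builds the multimodal prompt by offsetting the VQ codebook indices into the text-token id space (+126349) and framing them with the special token ids 126089, 126084, and 126085. A sketch of that assembly step; the constant names are my reading of the special-token list (<|mmu|>, <|soi|>, <|eoi|>) and are an assumption, as is the (B, N) shape returned by VQ_MODEL.get_code:

import torch

MMU_ID, SOI_ID, EOI_ID, VQ_OFFSET = 126089, 126084, 126085, 126349  # ids taken from the diff

def build_mmu_prompt(text_ids, image_codes):
    # text_ids: (B, T) long tensor; image_codes: (B, N) long codebook indices.
    def col(token_id):
        return torch.full((text_ids.shape[0], 1), token_id,
                          dtype=torch.long, device=text_ids.device)
    image_ids = image_codes + VQ_OFFSET  # map VQ indices into the text vocab
    return torch.cat([col(MMU_ID), col(SOI_ID), image_ids, col(EOI_ID), text_ids], dim=1)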
@@ -1025,8 +1063,8 @@ with gr.Blocks(css=css_styles, theme=theme) as demo:
     if VQ_MODEL is None:
         print("Loading VQ_MODEL for the first time...")
-        VQ_MODEL = MAGVITv2().from_pretrained("showlab/magvitv2")
-        print("VQ_MODEL loaded.")
+        VQ_MODEL = MAGVITv2().from_pretrained("showlab/magvitv2")
+        print("VQ_MODEL loaded to CPU.")

     default_model_choice = "MMaDA-8B-MixCoT"
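The last hunk only retags the VQ model's load message, but it documents the lifecycle the commit establishes: the quantizer is loaded once, lazily, and parked on the CPU until one of the wrappers above borrows the GPU. A sketch of that lazy singleton, assuming the Space's local models package exports MAGVITv2:

VQ_MODEL = None

def get_vq_model():
    # Load once per process and keep on CPU; the inference wrappers move it
    # to the GPU temporarily (see the try/finally blocks in the hunks above).
    global VQ_MODEL
    if VQ_MODEL is None:
        from models import MAGVITv2  # assumption: the repo's local module
        VQ_MODEL = MAGVITv2().from_pretrained("showlab/magvitv2")
    return VQ_MODEL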