Upload eruku_continuous_inf.py with huggingface_hub
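For orientation, a commit like this is typically produced with the huggingface_hub client; the sketch below shows the usual call, where the repo_id is a hypothetical placeholder.

    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_file(
        path_or_fileobj="eruku_continuous_inf.py",
        path_in_repo="eruku_continuous_inf.py",
        repo_id="user/space-name",  # hypothetical Space id
        repo_type="space",
        commit_message="Upload eruku_continuous_inf.py with huggingface_hub",
    )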
eruku_continuous_inf.py  ADDED  (+569 -0)

@@ -0,0 +1,569 @@
import torch
import os as _os
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Config
from custom_datasets import HFDataCollector
from einops.layers.torch import Rearrange
from einops import rearrange, repeat
from torch.nn import MSELoss, CTCLoss, CrossEntropyLoss
from pathlib import Path
from torchvision.utils import make_grid, save_image
from PIL import Image, ImageDraw, ImageFont
from models.origami import OrigamiNet
from diffusers import AutoencoderKL
from torch.nn.utils.rnn import pad_sequence
from torchvision.transforms import Normalize
import numpy as np
import torch.nn as nn
from typing import Tuple

# Safer defaults for clearer NCCL/CUDA error reporting during debugging
_os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
_os.environ.setdefault("TORCH_NCCL_BLOCKING_WAIT", "1")
_os.environ.setdefault("TORCH_NCCL_ASYNC_ERROR_HANDLING", "1")


def _safe_int_from_maybe_tensor(value, fallback_min: int = 64) -> int:
    """Convert a python int or 0-dim tensor (cpu/cuda) to int safely.

    - Synchronizes CUDA before .item() to surface the true failing kernel site
    - Moves to CPU before scalarization
    - Falls back to a reasonable minimum on unexpected errors
    """
    try:
        if isinstance(value, torch.Tensor):
            scalar_tensor = value
            # Take the first element if tensor is not scalar
            if scalar_tensor.dim() > 0:
                scalar_tensor = scalar_tensor.reshape(-1)[0]
            # Synchronize to attribute errors to the right op during debug
            if scalar_tensor.is_cuda:
                try:
                    torch.cuda.synchronize(scalar_tensor.device)
                except Exception:
                    pass
            return int(scalar_tensor.detach().to("cpu").item())
        return int(value)
    except Exception:
        # As a last resort, return a conservative minimum width
        return int(fallback_min)
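
# Illustrative behavior of the helper above (example values are hypothetical):
#   _safe_int_from_maybe_tensor(128)                       -> 128
#   _safe_int_from_maybe_tensor(torch.tensor([128, 256]))  -> 128 (first element)
#   _safe_int_from_maybe_tensor("not a number")            -> 64  (fallback_min)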

def pad_images(images, padding_value=1):
    images = [rearrange(img, 'c h w -> w c h') for img in images]
    padded = rearrange(pad_sequence(images, padding_value=padding_value), 'w b c h -> b c h w')
    return padded.contiguous()
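
# Shape sketch for pad_images (hypothetical sizes): two grayscale strips of
# shape [1, 64, 200] and [1, 64, 320] are right-padded along the width axis
# into a single batch of shape [2, 1, 64, 320], with padded pixels set to
# padding_value (1, i.e. white in the image range used here).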

# sog, eog, img
SPECIAL_TOKEN_COUNT = 3


class Emuru(torch.nn.Module):
    def __init__(self, t5_checkpoint='google-t5/t5-base',
                 vae_checkpoint='blowing-up-groundhogs/emuru_vae',
                 ocr_checkpoint='files/checkpoints/Origami_bw_img/origami.pth',
                 slices_per_query=1, channels=1,
                 text_dropout_probability=0.0, img_dropout_probability=0.0):
        super(Emuru, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')  # per-character tokenizer
        self.tokenizer.add_tokens(["<sog>"])
        self.data_collator = HFDataCollector(tokenizer=self.tokenizer)
        self.t5_name_or_path = t5_checkpoint

        self.padding_token = torch.tensor([[-0.4951, 0.8021, 0.3429, 0.5622, 0.5271, 0.5756, 0.7194, 0.6150]])
        self.padding_token_threshold = 0.484982096850872

        config = T5Config.from_pretrained(t5_checkpoint)
        config.vocab_size = len(self.tokenizer)
        self.T5 = T5ForConditionalGeneration(config)
        # Expose a HF-like config for downstream trainers expecting model.config
        self.config = self.T5.config
        # Ensure a valid identifier is present for downstream AutoProcessor lookups
        try:
            if not getattr(self.config, "_name_or_path", None):
                self.config._name_or_path = str(self.t5_name_or_path)
        except Exception:
            # As a safe fallback, set the attribute directly
            self.config._name_or_path = str(self.t5_name_or_path)
        self.T5.lm_head = torch.nn.Identity()
        self.normalize = Normalize(0.5, 0.5)
        self.sos = torch.nn.Embedding(1, config.d_model)
        self.sog = torch.nn.Embedding(1, config.d_model)
        self.eog = torch.nn.Embedding(1, config.d_model)

        self.vae = AutoencoderKL.from_pretrained(vae_checkpoint)

        vae_latent_dim = 8  # self.vae.config.get('latent_channels', 8)

        self.query_emb = torch.nn.Linear(vae_latent_dim * channels * slices_per_query, config.d_model)
        self.t5_to_vae = torch.nn.Linear(config.d_model, vae_latent_dim * channels * slices_per_query)
        self.t5_to_special = torch.nn.Linear(config.d_model, SPECIAL_TOKEN_COUNT)
        self.t5_to_ocr = torch.nn.Linear(config.d_model, len(self.tokenizer), bias=False)

        self.uncond_embedding = torch.nn.Embedding(1, config.d_model)
        # NOTE: text_dropout_probability and img_dropout_probability are accepted
        # for API compatibility but dropout stays disabled here (inference script)
        self.dropout_probability = 0.0
        self.drop_text = False
        self.drop_img = False

        self.set_training(self.vae, False)

        self.ocr = OrigamiNet.from_checkpoint(ocr_checkpoint, o_classes=165, n_channels=1)
        self.set_training(self.ocr, False)

        self.query_rearrange = Rearrange('b c h (w q) -> b w (q c h)', q=slices_per_query)
        self.special_rearrange = torch.nn.Identity()
        # self.special_rearrange = Rearrange('b w (h c) -> b w (h c)')
        self.z_rearrange = Rearrange('b w (q c h) -> b c h (w q)', c=channels, q=slices_per_query)
        self.z_rearrange_eval = Rearrange('w b (q c h) -> b c h (w q)', c=channels, q=slices_per_query)

        self.mse_criterion = MSELoss()  # (reduction='none')  TODO: change reduction if you intend to add a mask
        self.ce_criterion = CrossEntropyLoss()
        # self.ctc_criterion = CTCLoss()
        self.trainer = None
        self.alpha = 1.0
        # Minimal attributes for TRL compatibility
        self.warnings_issued = {}
        self._model_tags = set()
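
    # Shape sketch (an inference from the code, assuming 64-px-high grayscale
    # inputs and a 1-channel VAE latent of height 8, as implied by the
    # torch.ones(1, 8, 1) placeholders in get_model_inputs below): an image
    # [b, 1, 64, W] encodes to a latent [b, 1, 8, W//8]; each width position
    # flattens to an 8-dim vector, query_emb projects it to d_model, and
    # t5_to_vae / z_rearrange map predictions back to latent space.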

    def add_model_tags(self, tags):
        try:
            if isinstance(tags, (list, tuple, set)):
                self._model_tags.update(tags)
            elif isinstance(tags, str):
                self._model_tags.add(tags)
        except Exception:
            # No-op if tags updating fails
            pass

    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        """Enable gradient checkpointing - delegate to T5 model"""
        if hasattr(self.T5, 'gradient_checkpointing_enable'):
            self.T5.gradient_checkpointing_enable(gradient_checkpointing_kwargs)

    def gradient_checkpointing_disable(self):
        """Disable gradient checkpointing - delegate to T5 model"""
        if hasattr(self.T5, 'gradient_checkpointing_disable'):
            self.T5.gradient_checkpointing_disable()

    def set_training(self, model, training):
        model.train() if training else model.eval()
        for param in model.parameters():
            param.requires_grad = training

    def _img_encode(self, img):
        img = self.normalize(img)
        # Ensure contiguous memory layout before encode to avoid kernel issues
        img = img.contiguous()
        return self.vae.encode(img.float()).latent_dist.sample()

    @torch.no_grad()
    def get_model_inputs(self, style_img, gen_img, style_len, gen_len, max_img_len):
        bs = len(style_img)
        decoder_inputs_embeds_list = []
        specials_list = []

        # Move images to device and pad them
        style_img = pad_images([el.to(self.T5.device) for el in style_img])

        if gen_img is not None:
            gen_img = pad_images([el.to(self.T5.device) for el in gen_img])
            gen_img_embeds = self._img_encode(gen_img)
        else:
            gen_img_embeds = None

        style_img_embeds = self._img_encode(style_img)

        for el in range(bs):
            if isinstance(style_len, int):
                sl = style_len
            else:
                # Safely get the scalar style length
                sl_tensor = style_len[el] if hasattr(style_len, '__getitem__') else style_len
                sl = _safe_int_from_maybe_tensor(sl_tensor)

            # Ensure widths are within bounds (sl is in pixels, while
            # shape[-1] is the latent width, i.e. pixels // 8)
            sl = max(64, min(sl, style_img_embeds.shape[-1] * 8))

            # Start with style image embeds
            sample_embeds_parts = [style_img_embeds[el, :, :, :sl // 8]]
            specials_parts = [torch.ones(sl // 8) * 2]  # Img token

            if gen_img_embeds is not None and gen_len is not None:
                if isinstance(gen_len, int):
                    gl = gen_len
                else:
                    gl_tensor = gen_len[el] if hasattr(gen_len, '__getitem__') else gen_len
                    gl = _safe_int_from_maybe_tensor(gl_tensor)

                gl = max(64, min(gl, gen_img_embeds.shape[-1] * 8))
                sample_embeds_parts.extend([
                    torch.ones(1, 8, 1).to(self.T5.device),  # SOG token placeholder
                    gen_img_embeds[el, :, :, :gl // 8],
                    torch.ones(1, 8, 1).to(self.T5.device),  # EOG token placeholder
                ])
                specials_parts.extend([
                    torch.zeros(1),  # SOG
                    torch.ones(gl // 8) * 2,  # Img
                    torch.ones(1),  # EOG
                ])

            sample_embeds = torch.cat(sample_embeds_parts, dim=-1)

            h_dim = sample_embeds.shape[1]
            sample_embeds = rearrange(sample_embeds, 'c h w -> w (h c)', h=h_dim, c=1)

            decoder_inputs_embeds_list.append(sample_embeds)

            sample_specials = torch.cat(specials_parts, dim=0).to(self.T5.device)
            specials_list.append(sample_specials)

        # Pad sequences and ensure consistent shapes
        decoder_inputs_embeds_padded = pad_sequence(decoder_inputs_embeds_list, padding_value=1, batch_first=True)
        specials_padded = pad_sequence(specials_list, padding_value=1, batch_first=True)

        # Ensure we don't exceed max_img_len
        max_seq_len = max_img_len // 8
        if decoder_inputs_embeds_padded.size(1) > max_seq_len:
            decoder_inputs_embeds_padded = decoder_inputs_embeds_padded[:, :max_seq_len]
        if specials_padded.size(1) > max_seq_len:
            specials_padded = specials_padded[:, :max_seq_len]

        return {
            'decoder_inputs_embeds': decoder_inputs_embeds_padded,
            'specials': specials_padded.long(),
        }
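
    # Specials coding used throughout: 0 = <sog>, 1 = <eog>, 2 = image latent;
    # pad_sequence's padding_value=1 marks padded positions as <eog>.
    # Illustrative layout for one sample (hypothetical widths):
    #   [2, 2, ..., 2, 0, 2, 2, ..., 2, 1]
    #    style latents ^  gen latents  ^-- EOG
    #                  '-- SOG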

    def forward(self, decoder_inputs_embeds_vae, specials, style_text, gen_text, ce_multiplier=1.0):
        # style_img_embeds: [bs, w//8, 8, 1]
        # generate text embeddings
        with torch.no_grad():
            encoded_text = self.tokenizer([f"{style}<sog>{gen}" for style, gen in zip(style_text, gen_text)], padding=True, return_tensors="pt")

        # add special tokens to img
        sos = repeat(self.sos.weight, '1 d -> b 1 d', b=decoder_inputs_embeds_vae.size(0))
        sog = repeat(self.sog.weight, '1 d -> b d', b=decoder_inputs_embeds_vae.size(0))
        # eog = repeat(self.eog.weight, '1 d -> b d', b=decoder_inputs_embeds_vae.size(0))

        decoder_inputs_embeds = self.query_emb(decoder_inputs_embeds_vae)

        # Masked assignment avoids shape mismatches when placing special embeddings
        sog_mask = (specials == 0)
        eog_mask = (specials == 1)

        # Expand sog to match the sequence dimension
        sog_expanded = sog.unsqueeze(1).expand(-1, decoder_inputs_embeds.size(1), -1)
        if sog_mask.any():
            decoder_inputs_embeds[sog_mask] = sog_expanded[sog_mask]

        if eog_mask.any():
            # Expand eog to match the sequence dimension
            eog_expanded = self.eog.weight.unsqueeze(0).expand(decoder_inputs_embeds.size(0), decoder_inputs_embeds.size(1), -1)
            decoder_inputs_embeds[eog_mask] = eog_expanded[eog_mask]

        decoder_inputs_embeds = torch.cat([sos, decoder_inputs_embeds], dim=1)

        inputs_embeds = self.T5.shared(encoded_text['input_ids'].to(self.T5.device))
        drop_ids = torch.rand(inputs_embeds.shape[0], device=inputs_embeds.device) < self.dropout_probability
        if self.drop_text:
            inputs_embeds = torch.where(drop_ids[:, None, None], self.uncond_embedding.weight, inputs_embeds)
        if self.drop_img:
            decoder_inputs_embeds = torch.where(drop_ids[:, None, None], self.uncond_embedding.weight, decoder_inputs_embeds)

        output = self.T5(inputs_embeds=inputs_embeds, attention_mask=encoded_text['attention_mask'].to(self.T5.device), decoder_inputs_embeds=decoder_inputs_embeds)

        vae_latent = self.t5_to_vae(output.logits[:, :-1])
        special_latent = self.t5_to_special(output.logits[:, :-1])  # [bs, w//8, 3]
        pred_latent = self.z_rearrange(vae_latent)
        special_pred = self.special_rearrange(special_latent)

        ce_loss = ce_multiplier * self.ce_criterion(special_pred.flatten(0, 1), specials.flatten(0, 1))

        mse_mask = (specials == 2).unsqueeze(2)  # [bs, w//8]  TODO: consider putting the mask back in
        gt = decoder_inputs_embeds_vae * mse_mask
        vae_latent = vae_latent * mse_mask
        mse_loss = self.mse_criterion(vae_latent, gt)  # / mse_mask.sum()

        if self.alpha < 1.0:
            pred_img = self.vae.decode(pred_latent).sample
            gt_img = self.vae.decode(decoder_inputs_embeds_vae.unsqueeze(1)).sample
            ocr_preds = self.ocr(pred_img)
            ocr_gt = self.ocr(gt_img)
            ocr_loss = self.mse_criterion(ocr_preds, ocr_gt)
        else:
            ocr_loss = torch.tensor(0.0).to(mse_loss.device)
        loss = (ce_loss + mse_loss) * self.alpha + ocr_loss * (1 - self.alpha)
        return {'loss': loss, 'mse_loss': mse_loss, 'ce_loss': ce_loss, 'ocr_loss': ocr_loss}, pred_latent
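
    # The objective above combines the terms as
    #   loss = alpha * (ce_loss + mse_loss) + (1 - alpha) * ocr_loss
    # so with the default alpha = 1.0 the OCR feature-matching term (and the
    # extra VAE decodes it requires) is skipped entirely.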

    def split_characters(self, pred, gt, indices):
        pred = self.vae.decode(pred).sample
        gt = self.vae.decode(gt).sample
        img = torch.cat([gt, pred], dim=-2)

        curr_char = indices[0]
        for idx, char in enumerate(indices):
            if char != curr_char:
                img[:, :, :, idx * 8 - 1] = -1
                curr_char = char

        img = self.write_text_below_image(img, self.tokenizer.decode(indices))

        return img

    @torch.no_grad()
    def write_text_below_image(self, image, text):
        image = (torch.clamp(image, -1, 1) + 1) * 127.5
        image = rearrange(image.to(torch.uint8), '1 1 h w -> h w').cpu().numpy()
        image = Image.fromarray(image, mode='L')

        text = text.replace('<pad>', '#').replace('</s>', '$')

        # Load the font
        font = ImageFont.load_default()
        ascent, descent = font.getmetrics()
        (width, baseline), (offset_x, offset_y) = font.font.getsize(text)

        # Calculate dimensions for the new image
        img_width, img_height = image.size
        new_height = img_height + offset_y + ascent + descent

        # Create a new image with white background
        new_image = Image.new('L', (img_width, new_height), color='white')

        # Paste the original image onto the new image
        new_image.paste(image, (0, 0))

        # Draw the text onto the new image
        draw = ImageDraw.Draw(new_image)

        curr_char = None
        for idx, char in enumerate(text):
            if char != curr_char:
                curr_char = char
                draw.text((idx * 8, img_height), char, fill='black', font=font)

        return new_image

    @torch.inference_mode()
    def generate(self, decoder_inputs_embeds_vae, style_text, gen_text, cfg_scale=1.0, max_new_tokens=64):
        """
        Call this with bs=1, please.
        """
        encoded_text = self.tokenizer([f"{style}<sog>{gen}" for style, gen in zip(style_text, gen_text)], padding=True, return_tensors="pt")
        text_input_ids = encoded_text['input_ids'].to(self.T5.device)
        text_mask = encoded_text['attention_mask'].to(self.T5.device)

        sog = repeat(self.sog.weight, '1 d -> b 1 d', b=1)
        sos = repeat(self.sos.weight, '1 d -> b 1 d', b=1)
        z_sequence = [decoder_inputs_embeds_vae]
        special_sequence = torch.ones(decoder_inputs_embeds_vae.size(1)) * 3
        if len(z_sequence) == 0:
            # Effectively unreachable: z_sequence always starts with the style latents
            decoder_inputs_embeds = sos
        else:
            decoder_inputs_embeds = self.query_emb(torch.cat(z_sequence, dim=1))
            if len(style_text[0]) != 0:
                decoder_inputs_embeds = torch.cat([sos, decoder_inputs_embeds], dim=1)
            else:
                decoder_inputs_embeds = torch.cat([sos, decoder_inputs_embeds, sog], dim=1)
                vae_latent = self.t5_to_vae(sog)
                special_sequence = torch.cat([special_sequence, torch.zeros(1)])
                z_sequence.append(vae_latent)

        for i in range(max_new_tokens):
            if cfg_scale != 1.0:
                conditional_text_embeds = self.T5.shared(text_input_ids)
                if self.drop_text:
                    unconditional_text_embeds = torch.zeros_like(conditional_text_embeds).to(self.T5.device) + self.uncond_embedding.weight
                else:
                    unconditional_text_embeds = conditional_text_embeds

                if self.drop_img:
                    unconditional_decoder_inputs_embeds = torch.zeros_like(decoder_inputs_embeds).to(self.T5.device) + self.uncond_embedding.weight
                else:
                    unconditional_decoder_inputs_embeds = decoder_inputs_embeds

                output_unconditional = self.T5(inputs_embeds=unconditional_text_embeds, attention_mask=text_mask, decoder_inputs_embeds=unconditional_decoder_inputs_embeds).logits[:, -1:]
                output_conditional = self.T5(input_ids=text_input_ids, attention_mask=text_mask, decoder_inputs_embeds=decoder_inputs_embeds).logits[:, -1:]
                # Classifier-free guidance: extrapolate from the unconditional output toward the conditional one
                output = output_unconditional + (output_conditional - output_unconditional) * cfg_scale
            else:
                output = self.T5(input_ids=text_input_ids, attention_mask=text_mask, decoder_inputs_embeds=decoder_inputs_embeds).logits[:, -1:]

            special_prediction = self.t5_to_special(output)

            if torch.argmax(special_prediction, dim=-1) == 0:
                decoder_inputs_embeds = torch.cat([decoder_inputs_embeds, sog], dim=1)
                vae_latent = self.t5_to_vae(output)
                special_sequence = torch.cat([special_sequence, torch.zeros(1)])
            elif torch.argmax(special_prediction, dim=-1) == 1:
                special_sequence = torch.cat([special_sequence, torch.ones(1)])
                vae_latent = self.t5_to_vae(output)
                z_sequence.append(vae_latent)
                break
            else:
                vae_latent = self.t5_to_vae(output)
                decoder_inputs_embeds = torch.cat([decoder_inputs_embeds, self.query_emb(vae_latent)], dim=1)
                special_sequence = torch.cat([special_sequence, torch.ones(1) * 2])
                z_sequence.append(vae_latent)

        z_sequence = [el.to(self.vae.device) for el in z_sequence]
        z_sequence = torch.cat(z_sequence, dim=1)
        img = torch.clamp(self.vae.decode(self.z_rearrange(z_sequence)).sample, -1, 1)
        return img, special_sequence.to(self.T5.device)
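
    # Illustrative call (hypothetical inputs; generate expects batch size 1):
    #   inputs = model.get_model_inputs([style_img], None, style_width, None, max_img_len)
    #   img, specials = model.generate(inputs['decoder_inputs_embeds'],
    #                                  ["style text"], ["text to render"], cfg_scale=1.0)
    # Note that cfg_scale != 1.0 only changes the result when drop_text or
    # drop_img is enabled; otherwise the unconditional branch equals the
    # conditional one and the guidance term cancels.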

    @torch.no_grad()
    def continue_gen_test(self, gt, batch, max_new_tokens=64, cfg_scale=1.0):
        gt = gt[:1]

        def _continue_gen(style_len):
            generation = self.generate(batch['decoder_inputs_embeds'][:1, :style_len], batch['style_text'][:1], batch['gen_text'][:1], cfg_scale=cfg_scale, max_new_tokens=max_new_tokens)
            test_img = generation[0]
            special_sequence = generation[1].repeat_interleave(8)

            special_img = torch.zeros_like(test_img).repeat(1, 3, 1, 1)
            special_sequence = special_sequence[:special_img.size(-1)]
            special_img[:, 0, :, special_sequence == 2] = 1  # red: image
            special_img[:, 1, :, special_sequence == 0] = 1  # green: sog
            special_img[:, 2, :, special_sequence == 1] = 1  # blue: eog

            try:
                test_img[:, :, :, style_len * 8] = -1  # add a black line between style and pred
            except IndexError:
                print("couldn't add black line")
            # add special_img to the bottom of test_img
            test_img = torch.cat([test_img.repeat(1, 3, 1, 1), special_img], dim=-2)
            return test_img

        gt = torch.clamp(self.vae.decode(gt).sample, -1, 1)
        if isinstance(batch['style_img_width'], torch.Tensor):
            style_img_width = batch['style_img_width'][0]
        else:
            style_img_width = batch['style_img_width']

        return torch.cat(list(pad_images([
            # make_grid(_continue_gen(style_img_width // 8 - 10), nrow=1, normalize=True),
            make_grid(_continue_gen(style_img_width // 8), nrow=1, normalize=True),
        ])), dim=-2)

    def save_pretrained(self, path):
        # Note: only a subset of submodules is persisted (e.g. sog/eog and the
        # t5_to_* heads are not saved here)
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        torch.save(self.T5.state_dict(), path / 'T5.pth')
        torch.save(self.vae.state_dict(), path / 'VAE.pth')
        torch.save(self.ocr.state_dict(), path / 'OCR.pth')
        torch.save(self.query_emb.state_dict(), path / 'query_emb.pth')
        torch.save(self.sos.state_dict(), path / 'sos.pth')

    def load_pretrained(self, path):
        path = Path(path)
        self.T5.load_state_dict(torch.load(path / 'T5.pth'))
        self.vae.load_state_dict(torch.load(path / 'VAE.pth'))
        self.ocr.load_state_dict(torch.load(path / 'OCR.pth'))
        self.query_emb.load_state_dict(torch.load(path / 'query_emb.pth'))
        self.sos.load_state_dict(torch.load(path / 'sos.pth'))


class DDPCompatibleEmuru(Emuru):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, batch_data, mode='train'):
        """
        Unified forward method that handles different modes for DDP compatibility
        """
        if mode == 'train':
            # Training mode - expects the full batch with model inputs already computed
            return super().forward(
                batch_data['decoder_inputs_embeds'],
                batch_data['specials'],
                batch_data['style_text'],
                batch_data['gen_text']
            )
        elif mode == 'get_model_inputs':
            # Mode to get model inputs
            return super().get_model_inputs(
                batch_data['style_img'],
                batch_data['gen_img'],
                batch_data['style_img_width'],
                batch_data['gen_img_width'],
                batch_data['max_img_len']
            )
        elif mode == 'generate':
            # Generation mode
            return super().generate(
                batch_data['decoder_inputs_embeds_vae'],
                batch_data['style_text'],
                batch_data['gen_text'],
                batch_data.get('cfg_scale', 1.0),
                batch_data.get('max_new_tokens', 64)
            )
        elif mode == 'continue_gen_test':
            # Continue generation test mode; pass by keyword, since the parent
            # signature is (gt, batch, max_new_tokens, cfg_scale)
            return super().continue_gen_test(
                batch_data['gt'],
                batch_data['batch'],
                max_new_tokens=batch_data.get('max_new_tokens', 64),
                cfg_scale=batch_data.get('cfg_scale', 1.0)
            )
        else:
            raise ValueError(f"Unknown mode: {mode}")

    def module_get_model_inputs(self, style_img, gen_img, style_len, gen_len, max_img_len):
        """Direct access method for get_model_inputs when not using DDP forward"""
        return super().get_model_inputs(style_img, gen_img, style_len, gen_len, max_img_len)

    def module_continue_gen_test(self, gt, batch, max_new_tokens=64, cfg_scale=1.0):
        """Direct access method for continue_gen_test when not using DDP forward"""
        return super().continue_gen_test(gt, batch, max_new_tokens, cfg_scale)

    def module_vae_decode(self, latents):
        """Direct access method for VAE decode"""
        return self.vae.decode(latents)

    def get_trainable_parameters(self):
        """
        Get only the parameters that have requires_grad=True.
        Useful for creating optimizers with only trainable parameters.
        """
        return [p for p in self.parameters() if p.requires_grad]

    def get_parameter_count(self):
        """
        Get counts of total and trainable parameters
        """
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return {
            'total_parameters': total_params,
            'trainable_parameters': trainable_params,
            'frozen_parameters': total_params - trainable_params
        }

    def print_parameter_info(self):
        """
        Print detailed information about model parameters
        """
        info = self.get_parameter_count()
        print("Model Parameter Info:")
        print(f"  Total parameters: {info['total_parameters']:,}")
        print(f"  Trainable parameters: {info['trainable_parameters']:,}")
        print(f"  Frozen parameters: {info['frozen_parameters']:,}")
        print(f"  Trainable ratio: {info['trainable_parameters']/info['total_parameters']:.2%}")

        # Print per-module info
        print("\nPer-module breakdown:")
        for name, module in self.named_children():
            module_total = sum(p.numel() for p in module.parameters())
            module_trainable = sum(p.numel() for p in module.parameters() if p.requires_grad)
            if module_total > 0:
                print(f"  {name}: {module_trainable:,}/{module_total:,} trainable ({module_trainable/module_total:.1%})")
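
For orientation, here is a minimal usage sketch of the uploaded module. It is an illustration under stated assumptions, not part of the file: the input sizes are hypothetical, the default checkpoints are assumed to resolve, and custom_datasets / models.origami must be importable in the Space.

    # Hypothetical end-to-end sketch: build the model, encode a style strip,
    # then autoregressively render new text in that style.
    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = DDPCompatibleEmuru().eval().to(device)  # assumes default checkpoints resolve
    style_img = torch.rand(1, 64, 256)              # [C=1, H=64, W=256] grayscale strip in [0, 1]
    inputs = model.module_get_model_inputs([style_img], None, 256, None, max_img_len=2048)
    img, specials = model(
        {'decoder_inputs_embeds_vae': inputs['decoder_inputs_embeds'],
         'style_text': ["the quick brown"],
         'gen_text': ["fox jumps"]},
        mode='generate',
    )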