Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

pixeldit/__init__.py +17 -0
pixeldit/configuration_pixeldit.py +43 -0
pixeldit/modeling_pixeldit.py +47 -0
pixeldit/modeling_pixeldit_hf.py +75 -0
pixeldit/pipeline.py +81 -0
pixeldit/scheduling_flow.py +65 -0
pixeldit/text_encoder_gemma.py +79 -0
pixeldit/text_encoder_qwen.py +72 -0

pixeldit/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from .pipeline import PixelDiTPipeline
+from .modeling_pixeldit import load_pixeldit
+from .modeling_pixeldit_hf import PixelDiTModel
+from .configuration_pixeldit import PixelDiTConfig
+from .text_encoder_gemma import GemmaEncoder
+from .text_encoder_qwen import QwenEncoder
+from .scheduling_flow import FlowScheduler
+# Public names re-exported by the package (order kept as originally listed).
+__all__ = [
+    "PixelDiTPipeline", "load_pixeldit",
+    "PixelDiTModel", "PixelDiTConfig",
+    "GemmaEncoder", "QwenEncoder",
+    "FlowScheduler",
+]

pixeldit/configuration_pixeldit.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from transformers import PretrainedConfig
+class PixelDiTConfig(PretrainedConfig):
+    """Hyperparameter container for the PixelDiT network.
+    Each constructor argument is stored verbatim on the instance under the
+    same name; any extra keyword arguments are passed to ``PretrainedConfig``.
+    """
+    model_type = "pixeldit"
+    def __init__(
+        self,
+        in_channels=3, num_groups=24, hidden_size=1536,
+        pixel_hidden_size=16, pixel_attn_hidden_size=1152, pixel_num_groups=16,
+        patch_depth=14, pixel_depth=2, num_text_blocks=4, patch_size=16,
+        txt_embed_dim=2304, txt_max_length=300,
+        use_text_rope=True, text_rope_theta=10000.0,
+        repa_encoder_index=-1, use_pixel_abs_pos=True,
+        **kwargs,
+    ):
+        # The base class consumes the generic options before the fields are set.
+        super().__init__(**kwargs)
+        self.in_channels = in_channels
+        self.num_groups = num_groups
+        self.hidden_size = hidden_size
+        self.pixel_hidden_size = pixel_hidden_size
+        self.pixel_attn_hidden_size = pixel_attn_hidden_size
+        self.pixel_num_groups = pixel_num_groups
+        self.patch_depth = patch_depth
+        self.pixel_depth = pixel_depth
+        self.num_text_blocks = num_text_blocks
+        self.patch_size = patch_size
+        self.txt_embed_dim = txt_embed_dim
+        self.txt_max_length = txt_max_length
+        self.use_text_rope = use_text_rope
+        self.text_rope_theta = text_rope_theta
+        self.repa_encoder_index = repa_encoder_index
+        self.use_pixel_abs_pos = use_pixel_abs_pos

pixeldit/modeling_pixeldit.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+PixelDiT model loader.
+Usage:
+    from pixeldit.modeling_pixeldit import load_pixeldit
+    model = load_pixeldit()
+    out = model(x, t, y)   # [B,3,H,W], [B], [B,300,2304] -> [B,3,H,W]
+"""
+import sys
+import torch
+# NOTE(review): machine-specific absolute path; the import below only resolves
+# where the PixelDiT sources live at exactly this location.
+sys.path.insert(0, "/home/nobus/Raid0/PixelDiT")
+from pixdit_core.pixeldit_t2i import PixDiT_T2I
+# Default checkpoint file used by load_pixeldit().
+# NOTE(review): also a machine-specific absolute path (local HF cache snapshot).
+_CKPT = (
+    "/home/nobus/.cache/huggingface/hub/"
+    "models--nvidia--PixelDiT-1300M-1024px/snapshots/"
+    "7c63b99a7a399918a1d6478b095698a65f664847/pixeldit_t2i_v1.pth"
+)
+# Keyword arguments passed to PixDiT_T2I by load_pixeldit(); anything not
+# listed here is left to the PixDiT_T2I constructor defaults.
+_ARCH = dict(
+    in_channels=3,
+    num_groups=24,
+    hidden_size=1536,
+    pixel_hidden_size=16,
+    pixel_attn_hidden_size=1152,
+    pixel_num_groups=16,
+    patch_depth=14,
+    pixel_depth=2,
+    patch_size=16,
+    txt_embed_dim=2304,
+    txt_max_length=300,
+)
+def load_pixeldit(checkpoint=_CKPT, device="cuda", dtype=torch.bfloat16):
+    """Build ``PixDiT_T2I`` from ``_ARCH`` and load weights from a ``.pth`` file.
+    Args:
+        checkpoint: path to the checkpoint; either a raw state dict or a dict
+            holding one under ``"state_dict"``.
+        device: device the model is moved to after loading.
+        dtype: dtype the model is cast to after loading.
+    Returns:
+        The model in eval mode on ``device`` with ``dtype``.
+    """
+    model = PixDiT_T2I(**_ARCH)
+    # SECURITY: weights_only=False uses the full pickle loader, which can run
+    # arbitrary code; only load checkpoints from a trusted source.
+    state = torch.load(checkpoint, map_location="cpu", weights_only=False)
+    sd = state.get("state_dict", state)
+    # Drop the leading "core." wrapper prefix where present.
+    sd = {k.removeprefix("core."): v for k, v in sd.items()}
+    missing, unexpected = model.load_state_dict(sd, strict=False)
+    if missing:
+        print(f"[modeling] {len(missing)} missing keys (expected)")
+    if unexpected:
+        # strict=False drops these silently; report them so that a checkpoint
+        # that does not match the architecture is noticed.
+        print(f"[modeling] {len(unexpected)} unexpected keys ignored")
+    model = model.to(device).to(dtype).eval()
+    print(f"[modeling] PixelDiT loaded — {sum(p.numel() for p in model.parameters()):,} params")
+    return model

pixeldit/modeling_pixeldit_hf.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+HF-compatible PixelDiT wrapper.
+Allows save_pretrained / from_pretrained and peft LoRA targeting.
+Usage:
+    # Convert from original .pth
+    model = PixelDiTModel.from_pth("pixeldit_t2i_v1.pth")
+    model.save_pretrained("pixeldit-diffusers/")
+    # Load back
+    model = PixelDiTModel.from_pretrained("pixeldit-diffusers/")
+    # LoRA
+    from peft import get_peft_model, LoraConfig
+    lora_cfg = LoraConfig(target_modules=["qkv_x", "qkv_y", "proj_x", "proj_y"])
+    model = get_peft_model(model, lora_cfg)
+"""
+import sys
+import torch
+from transformers import PreTrainedModel
+# NOTE(review): machine-specific absolute path; the import below only resolves
+# where the PixelDiT sources live at exactly this location.
+sys.path.insert(0, "/home/nobus/Raid0/PixelDiT")
+from pixdit_core.pixeldit_t2i import PixDiT_T2I
+from .configuration_pixeldit import PixelDiTConfig
+class PixelDiTModel(PreTrainedModel):
+    """``PreTrainedModel`` wrapper holding a ``PixDiT_T2I`` network as ``self.model``."""
+    config_class = PixelDiTConfig
+    _tied_weights_keys = []
+    @property
+    def all_tied_weights_keys(self):
+        """Always an empty mapping."""
+        return {}
+    def __init__(self, config: PixelDiTConfig):
+        """Instantiate the wrapped network from the fields of ``config``."""
+        super().__init__(config)
+        # Each config field is forwarded to PixDiT_T2I under the same keyword.
+        field_names = (
+            "in_channels", "num_groups", "hidden_size",
+            "pixel_hidden_size", "pixel_attn_hidden_size", "pixel_num_groups",
+            "patch_depth", "pixel_depth", "num_text_blocks", "patch_size",
+            "txt_embed_dim", "txt_max_length",
+            "use_text_rope", "text_rope_theta",
+            "repa_encoder_index", "use_pixel_abs_pos",
+        )
+        arch_kwargs = {name: getattr(config, name) for name in field_names}
+        self.model = PixDiT_T2I(**arch_kwargs)
+    def forward(self, x, t, y, s=None, mask=None):
+        """Delegate to the wrapped network, passing ``s`` and ``mask`` by keyword."""
+        net = self.model
+        return net(x, t, y, s=s, mask=mask)
+    @classmethod
+    def from_pth(cls, pth_path: str, config: PixelDiTConfig = None):
+        """Load from original nvidia .pth checkpoint, handles core. prefix."""
+        cfg = PixelDiTConfig() if config is None else config
+        model = cls(cfg)
+        # NOTE(review): weights_only=False uses the full pickle loader; only
+        # load checkpoints from a trusted source.
+        checkpoint = torch.load(pth_path, map_location="cpu", weights_only=False)
+        weights = checkpoint.get("state_dict", checkpoint)
+        # Strip the trainer's "core." prefix, then prepend the "model." prefix
+        # under which this wrapper registers the network.
+        remapped = {
+            "model." + key.removeprefix("core."): tensor
+            for key, tensor in weights.items()
+        }
+        missing, unexpected = model.load_state_dict(remapped, strict=False)
+        print(f"[PixelDiTModel.from_pth] loaded — {len(missing)} missing, {len(unexpected)} unexpected")
+        return model

pixeldit/pipeline.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""
+PixelDiT T2I Pipeline — thin orchestrator.
+Usage:
+    from pixeldit.pipeline import PixelDiTPipeline
+    pipe = PixelDiTPipeline()
+    images = pipe("a viking at sunset", height=512, width=512)
+    images[0].save("out.jpg")
+"""
+import torch
+from PIL import Image
+from .modeling_pixeldit import load_pixeldit
+from .modeling_pixeldit_hf import PixelDiTModel
+from .text_encoder_gemma import GemmaEncoder
+from .text_encoder_qwen import QwenEncoder
+from .scheduling_flow import FlowScheduler
+class PixelDiTPipeline:
+    """Text-to-image pipeline: text encoder, PixelDiT model and flow sampler.
+    Calling the instance encodes the prompts, samples from Gaussian noise with
+    ``FlowScheduler`` and returns the results as PIL images.
+    """
+    def __init__(
+        self,
+        text_encoder="gemma",   # "gemma" | "qwen"
+        qwen_proj=None,
+        device="cuda",
+        dtype=torch.bfloat16,
+        cfg=3.5,
+        flow_shift=4.0,
+        pretrained=None,        # HF dir or repo id — loads via from_pretrained instead of .pth
+    ):
+        """Create the encoder, the model and the sampler.
+        Args:
+            text_encoder: ``"qwen"`` selects ``QwenEncoder``; any other value
+                selects ``GemmaEncoder``.
+            qwen_proj: projection checkpoint path forwarded to ``QwenEncoder``.
+            device: device for the model, the noise and the text embeddings.
+            dtype: dtype for the model, the noise and the text embeddings.
+            cfg: default classifier-free guidance scale.
+            flow_shift: value forwarded to ``FlowScheduler``.
+            pretrained: when given, the model is loaded with
+                ``PixelDiTModel.from_pretrained``; otherwise ``load_pixeldit``.
+        """
+        self.device = torch.device(device)
+        self.dtype  = dtype
+        if text_encoder == "qwen":
+            self.encoder = QwenEncoder(proj_path=qwen_proj, output_device=device, output_dtype=dtype)
+        else:
+            self.encoder = GemmaEncoder(output_device=device, output_dtype=dtype)
+        if pretrained is not None:
+            print(f"[pipeline] loading from HF: {pretrained}")
+            self.model = (
+                PixelDiTModel.from_pretrained(pretrained)
+                .to(device).to(dtype).eval()
+            )
+        else:
+            self.model = load_pixeldit(device=device, dtype=dtype)
+        self.scheduler = FlowScheduler(self.model, cfg=cfg, flow_shift=flow_shift)
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt,
+        negative_prompt="",
+        height=512,
+        width=512,
+        steps=20,
+        cfg=None,
+        seed=None,
+    ):
+        """Generate one image per prompt.
+        Args:
+            prompt: a string or an iterable of strings.
+            negative_prompt: empty/None for the encoder's null embedding, a
+                string applied to every prompt, or one string per prompt.
+            height, width: spatial size of the sampled noise tensor.
+            steps: number of sampler steps.
+            cfg: guidance scale for this call only; ``None`` keeps the default.
+            seed: when given, seeds the global torch RNG before sampling.
+        Returns:
+            A list of ``PIL.Image`` objects, empty when no prompt is given.
+        Raises:
+            ValueError: if a sequence of negative prompts does not match the
+                number of prompts.
+        """
+        prompts = [prompt] if isinstance(prompt, str) else list(prompt)
+        B = len(prompts)
+        if B == 0:
+            return []
+        # Normalise the negative prompt up front so a bad argument fails
+        # before any expensive text encoding happens.
+        negatives = None
+        if negative_prompt:
+            if isinstance(negative_prompt, str):
+                negatives = [negative_prompt] * B
+            else:
+                negatives = list(negative_prompt)
+                if len(negatives) != B:
+                    raise ValueError(
+                        f"got {len(negatives)} negative prompts for {B} prompts"
+                    )
+        if seed is not None:
+            torch.manual_seed(seed)
+        cond = self.encoder.encode(prompts)
+        uncond = (self.encoder.encode_null(B) if negatives is None
+                  else self.encoder.encode(negatives))
+        noise = torch.randn(B, 3, height, width, device=self.device, dtype=self.dtype)
+        # Apply the per-call guidance scale temporarily: the scheduler reads
+        # its ``cfg`` attribute while sampling, and the override must not leak
+        # into later calls that rely on the pipeline default.
+        default_cfg = self.scheduler.cfg
+        if cfg is not None:
+            self.scheduler.cfg = cfg
+        try:
+            imgs = self.scheduler.sample(noise, cond, uncond, steps=steps)
+        finally:
+            self.scheduler.cfg = default_cfg
+        # Scale in float32 and round to nearest: a plain ``.byte()`` truncates,
+        # which biases every pixel downwards.
+        imgs = (imgs.float().clamp(-1, 1) + 1) / 2
+        imgs = (imgs * 255).round().byte().permute(0, 2, 3, 1).cpu().numpy()
+        return [Image.fromarray(img) for img in imgs]

pixeldit/scheduling_flow.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+Flow-matching DPM-Solver++ sampler for PixelDiT.
+Wraps the original DPMS from the PixelDiT repo.
+Order=2 multistep gets quality at 20 steps that Euler needs 100+ for.
+Usage:
+    from pixeldit.scheduling_flow import FlowScheduler
+    scheduler = FlowScheduler(model_fn, cfg=3.5, flow_shift=4.0)
+    image = scheduler.sample(noise, cond, uncond, steps=20)
+"""
+import sys
+import torch
+from tqdm import tqdm
+# NOTE(review): machine-specific absolute path; the import below only resolves
+# where the PixelDiT sources live at exactly this location.
+sys.path.insert(0, "/home/nobus/Raid0/PixelDiT/t2i")
+from diffusion.model.flow_dpm import DPMS
+# Default flow_shift for FlowScheduler.
+_FLOW_SHIFT = 4.0   # 1024px stage-3 config
+class FlowScheduler:
+    """Sampler front-end that runs ``DPMS`` with classifier-free guidance."""
+    def __init__(self, model_fn, cfg=3.5, flow_shift=_FLOW_SHIFT):
+        """
+        model_fn:   callable(x, t, y) -> velocity  [B,3,H,W]
+        cfg:        classifier-free guidance scale
+        flow_shift: value handed to ``DPMS.sample`` as ``flow_shift``
+        """
+        def _squeeze_text_axis(x, t, y):
+            # DPMS passes y as [B,1,L,D] but PixDiT_T2I expects [B,L,D].
+            if y.dim() == 4:
+                y = y.squeeze(1)
+            return model_fn(x, t, y)
+        self.model_fn   = _squeeze_text_axis
+        self.cfg        = cfg
+        self.flow_shift = flow_shift
+    @torch.no_grad()
+    def sample(
+        self,
+        noise: torch.Tensor,       # [B, 3, H, W] Gaussian noise
+        cond:  torch.Tensor,       # [B, 300, 2304]
+        uncond: torch.Tensor,      # [B, 300, 2304]
+        steps: int = 20,
+    ) -> torch.Tensor:
+        """Returns denoised image tensor [B, 3, H, W] in [-1, 1]."""
+        # DPMS expects the text conditions as [B, 1, L, D].
+        guided = cond.unsqueeze(1)
+        unguided = uncond.unsqueeze(1)
+        solver = DPMS(
+            self.model_fn,
+            condition=guided,
+            uncondition=unguided,
+            cfg_scale=self.cfg,
+            model_type="flow",
+            schedule="FLOW",
+            guidance_type="classifier-free",
+            interval_guidance=[0, 1],
+        )
+        # Second-order multistep solve on the flow time grid.
+        solve_options = {
+            "steps": steps,
+            "order": 2,
+            "skip_type": "time_uniform_flow",
+            "method": "multistep",
+            "flow_shift": self.flow_shift,
+        }
+        return solver.sample(noise, **solve_options)

pixeldit/text_encoder_gemma.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""
+Gemma-2-2B text encoder for PixelDiT.
+Handles chi_prompt prefix + select_index to match training exactly.
+Usage:
+    from pixeldit.text_encoder_gemma import GemmaEncoder
+    enc  = GemmaEncoder()
+    cond = enc.encode(["a dragon at sunset"])  # [1, 300, 2304]
+    null = enc.encode_null(1)                  # [1, 300, 2304]
+"""
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Default Hub id passed to the tokenizer/model loaders in GemmaEncoder.
+_GEMMA_ID  = "Efficient-Large-Model/gemma-2-2b-it"
+# Embedding width quoted in the encoder docstrings ([B, 300, 2304]).
+_GEMMA_DIM = 2304
+# Number of token positions each encoded prompt is reduced/padded to.
+_TXT_MAX   = 300
+# Instruction prefix prepended to every prompt in GemmaEncoder.encode().
+# The text is part of the model input; do not edit it casually.
+_CHI_PROMPT = "\n".join([
+    'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:',
+    '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.',
+    '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.',
+    'Here are examples of how to transform or refine prompts:',
+    '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.',
+    '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.',
+    'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:',
+    'User Prompt: ',
+])
+# Positions kept from the encoder output: index 0 followed by the last
+# _TXT_MAX - 1 positions, i.e. _TXT_MAX positions in total.
+_SELECT_IDX = [0] + list(range(-(_TXT_MAX - 1), 0))
+class GemmaEncoder:
+    """Text encoder that turns prompts into Gemma decoder hidden states.
+    ``encode`` prepends ``_CHI_PROMPT`` and keeps the positions listed in
+    ``_SELECT_IDX``; ``encode_null`` embeds the empty string without the
+    prefix. Results are moved to ``output_device`` / ``output_dtype``.
+    """
+    def __init__(
+        self,
+        model_id=_GEMMA_ID,
+        output_device="cuda",
+        output_dtype=torch.bfloat16,
+    ):
+        """Load the tokenizer and the decoder for ``model_id``.
+        Args:
+            model_id: Hub id or path given to ``from_pretrained``.
+            output_device: device the returned embeddings are moved to.
+            output_dtype: dtype the returned embeddings are cast to.
+        """
+        self.output_device = torch.device(output_device)
+        self.output_dtype  = output_dtype
+        print(f"[GemmaEncoder] loading {model_id} (CPU)")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.tokenizer.padding_side = "right"
+        # Only the decoder stack is kept; it is loaded in float32.
+        self._model = (
+            AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
+            .get_decoder().eval()
+        )
+        # Token count of the instruction prefix as produced by the tokenizer.
+        self._num_chi_tokens = len(self.tokenizer.encode(_CHI_PROMPT))
+        print("[GemmaEncoder] ready")
+    @torch.no_grad()
+    def _hidden_states(self, texts, max_length):
+        """Tokenize ``texts`` to exactly ``max_length`` tokens and return the last hidden state."""
+        tok = self.tokenizer(
+            texts, max_length=max_length,
+            padding="max_length", truncation=True, return_tensors="pt",
+        )
+        return self._model(
+            input_ids=tok.input_ids,
+            attention_mask=tok.attention_mask,
+        ).last_hidden_state
+    @torch.no_grad()
+    def encode(self, texts: list[str]) -> torch.Tensor:
+        """Returns [B, 300, 2304]."""
+        if isinstance(texts, str):
+            # A bare string would otherwise be iterated character by
+            # character and encoded as many one-letter prompts.
+            texts = [texts]
+        texts_full = [_CHI_PROMPT + t for t in texts]
+        max_len = self._num_chi_tokens + _TXT_MAX - 2
+        emb = self._hidden_states(texts_full, max_len)
+        # Keep position 0 plus the trailing _TXT_MAX - 1 positions.
+        emb = emb[:, _SELECT_IDX, :]
+        return emb.to(self.output_device).to(self.output_dtype)
+    @torch.no_grad()
+    def encode_null(self, batch_size: int) -> torch.Tensor:
+        """Returns [B, 300, 2304] for empty string (CFG unconditional)."""
+        # Every row is the same empty prompt, so run the model once and
+        # replicate the result instead of encoding batch_size copies.
+        emb = self._hidden_states([""], _TXT_MAX)
+        emb = emb.to(self.output_device).to(self.output_dtype)
+        return emb.repeat(batch_size, 1, 1)

pixeldit/text_encoder_qwen.py ADDED Viewed

	@@ -0,0 +1,72 @@

+"""
+Qwen3-2B text encoder for PixelDiT.
+Requires a trained projection (train_qwen_proj.py) to map 2048→2304.
+Usage:
+    from pixeldit.text_encoder_qwen import QwenEncoder
+    enc  = QwenEncoder(proj_path="pixeldit/qwen_proj.pt")
+    cond = enc.encode(["a dragon at sunset"])  # [1, 300, 2304]
+    null = enc.encode_null(1)                  # [1, 300, 2304]
+"""
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel
+# Default Hub id passed to the tokenizer/model loaders in QwenEncoder.
+# NOTE(review): confirm that this repository id exists on the Hub — TODO verify.
+_QWEN_ID   = "Qwen/Qwen3-2B"
+_QWEN_DIM  = 2048  # input width of the projection layer
+_GEMMA_DIM = 2304  # output width of the projection layer
+_TXT_MAX   = 300   # tokenizer max_length; prompts are padded/truncated to it
+class QwenEncoder:
+    """Text encoder: Qwen hidden states followed by a bias-free linear projection.
+    The projection maps ``_QWEN_DIM`` features to ``_GEMMA_DIM`` features and
+    runs on ``output_device`` in ``output_dtype``.
+    """
+    def __init__(
+        self,
+        model_id=_QWEN_ID,
+        proj_path=None,           # path to trained qwen_proj.pt
+        output_device="cuda",
+        output_dtype=torch.bfloat16,
+    ):
+        """Load tokenizer, model and projection.
+        Args:
+            model_id: Hub id or path given to ``from_pretrained``.
+            proj_path: state dict for the projection layer; when omitted the
+                projection copies the input features into the first
+                ``_QWEN_DIM`` output channels and leaves the rest zero.
+            output_device: device of the projection and the returned tensors.
+            output_dtype: dtype of the projection and the returned tensors.
+        """
+        self.output_device = torch.device(output_device)
+        self.output_dtype  = output_dtype
+        print(f"[QwenEncoder] loading {model_id} (CPU)")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.tokenizer.padding_side = "right"
+        self._model = AutoModel.from_pretrained(model_id, torch_dtype=torch.float32).eval()
+        self.proj = nn.Linear(_QWEN_DIM, _GEMMA_DIM, bias=False)
+        if proj_path:
+            sd = torch.load(proj_path, map_location="cpu", weights_only=True)
+            self.proj.load_state_dict(sd)
+            print(f"[QwenEncoder] loaded projection: {proj_path}")
+        else:
+            with torch.no_grad():
+                # Identity block on top, zero rows below.
+                w = torch.zeros(_GEMMA_DIM, _QWEN_DIM)
+                w[:_QWEN_DIM] = torch.eye(_QWEN_DIM)
+                self.proj.weight.copy_(w)
+            print("[QwenEncoder] projection: identity init — run train_qwen_proj.py for real quality")
+        self.proj = self.proj.to(self.output_device).to(output_dtype)
+        print("[QwenEncoder] ready")
+    @torch.no_grad()
+    def _embed(self, texts):
+        """Tokenize ``texts`` to ``_TXT_MAX`` tokens, run the model and project the hidden states."""
+        tok = self.tokenizer(
+            texts, max_length=_TXT_MAX,
+            padding="max_length", truncation=True, return_tensors="pt",
+        )
+        emb = self._model(**tok).last_hidden_state
+        # Move to the projection's device/dtype before applying it.
+        emb = emb.to(self.output_device).to(self.output_dtype)
+        return self.proj(emb)
+    @torch.no_grad()
+    def encode(self, texts: list[str]) -> torch.Tensor:
+        """Returns [B, 300, 2304]."""
+        return self._embed(texts)
+    @torch.no_grad()
+    def encode_null(self, batch_size: int) -> torch.Tensor:
+        """Returns [B, 300, 2304] for empty string (CFG unconditional)."""
+        # Every row is the same empty prompt, so run the model once and
+        # replicate the result instead of encoding batch_size copies.
+        return self._embed([""]).repeat(batch_size, 1, 1)