illusion615 commited on
Commit
31f3da5
Β·
verified Β·
1 Parent(s): 8e1e444

Upload folder using huggingface_hub

Browse files
Files changed (11) hide show
  1. README.md +109 -0
  2. __init__.py +6 -0
  3. autoencoder.py +188 -0
  4. clip_encoder.py +155 -0
  5. download_weights.py +41 -0
  6. flux_model.py +447 -0
  7. pipeline.py +410 -0
  8. sampler.py +125 -0
  9. t5_encoder.py +226 -0
  10. tokenizers.py +150 -0
  11. weight_loader.py +236 -0
README.md ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ library_name: mlx
6
+ tags:
7
+ - mlx
8
+ - text-to-image
9
+ - apple-silicon
10
+ - image-generation
11
+ - diffusion
12
+ - flux
13
+ base_model: black-forest-labs/FLUX.1-schnell
14
+ pipeline_tag: text-to-image
15
+ ---
16
+
17
+ # FLUX.1-schnell MLX Pipeline
18
+
19
+ **Pure MLX (Apple Silicon) inference pipeline for [FLUX.1-schnell](https://huggingface.co/black-forest-labs/FLUX.1-schnell)** — a fast text-to-image model by Black Forest Labs.
20
+
21
+ Zero PyTorch dependency. Runs natively on Apple Silicon via Metal GPU.
22
+
23
+ ## Highlights
24
+
25
+ - **100% MLX native** β€” no torch, no diffusers needed
26
+ - **4-bit quantization** support via `argmaxinc/mlx-FLUX.1-schnell-4bit-quantized`
27
+ - **Fast 4-step generation** (FLUX.1-schnell is distilled for speed)
28
+ - **T5-XXL + CLIP-L** dual text encoders
29
+ - **FluxTransformer** with 19 Joint Blocks + 38 Single Blocks + N-dim RoPE
30
+
31
+ ## Architecture
32
+
33
+ ```
34
+ FluxPipeline
35
+ ├── T5-XXL Encoder (24 layers, hidden=4096)
36
+ │   └── Relative positional attention + GatedFFN
37
+ ├── CLIP-L Encoder (23 layers, hidden=768)
38
+ │   └── Causal mask + EOS pooling
39
+ ├── FluxTransformer (DiT)
40
+ │   ├── 19 JointTransformerBlock (txt+img joint attention)
41
+ │   ├── 38 SingleTransformerBlock (img self-attention)
42
+ │   └── N-dim RoPE (axes_dim=[16,56,56])
43
+ ├── AutoencoderKL Decoder
44
+ │   └── Latent channels=16, block_out=[128,256,512,512]
45
+ └── FlowMatchEuler Sampler
46
+ ```
47
+
48
+ ## Quick Start
49
+
50
+ ### Install
51
+
52
+ ```bash
53
+ pip install mlx safetensors sentencepiece tokenizers pillow numpy
54
+ ```
55
+
56
+ ### Download Weights
57
+
58
+ ```bash
59
+ # 4-bit quantized (recommended, ~5GB)
60
+ huggingface-cli download argmaxinc/mlx-FLUX.1-schnell-4bit-quantized
61
+
62
+ # Or full precision
63
+ huggingface-cli download argmaxinc/mlx-FLUX.1-schnell
64
+ ```
65
+
66
+ ### Generate
67
+
68
+ ```python
69
+ from pipeline import FluxPipeline
70
+
71
+ pipe = FluxPipeline()
72
+ pipe.load()
73
+
74
+ result = pipe.generate_and_save(
75
+ prompt="a beautiful sunset over mountains",
76
+ output_path="output.png",
77
+ width=512,
78
+ height=512,
79
+ num_steps=4,
80
+ seed=42,
81
+ )
82
+ print(f"Generated in {result['elapsed_s']}s")
83
+
84
+ pipe.unload()
85
+ ```
86
+
87
+ ## Files
88
+
89
+ ```
90
+ ├── pipeline.py          # Main inference pipeline
91
+ ├── flux_model.py        # FluxTransformer (JointBlock + SingleBlock)
92
+ ├── t5_encoder.py        # T5-XXL text encoder
93
+ ├── clip_encoder.py      # CLIP-L text encoder
94
+ ├── autoencoder.py       # VAE decoder
95
+ ├── sampler.py           # FlowMatch Euler sampler
96
+ ├── tokenizers.py        # T5 + CLIP tokenizers
97
+ ├── weight_loader.py     # Weight loading + key mapping
98
+ └── download_weights.py  # HF Hub download helper
99
+ ```
100
+
101
+ ## Model Source
102
+
103
+ Inference code is original work. Weights are loaded from:
104
+ - [argmaxinc/mlx-FLUX.1-schnell-4bit-quantized](https://huggingface.co/argmaxinc/mlx-FLUX.1-schnell-4bit-quantized) (default)
105
+ - [black-forest-labs/FLUX.1-schnell](https://huggingface.co/black-forest-labs/FLUX.1-schnell) (original)
106
+
107
+ ## License
108
+
109
+ Apache 2.0
__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """MLX FLUX pipeline package.
2
+
3
+ Provides a minimal FLUX.1-schnell diffusion pipeline implemented in
4
+ pure MLX for Apple Silicon inference. Uses pre-converted weights from
5
+ HuggingFace (argmaxinc/mlx-FLUX.1-schnell or 4bit variant).
6
+ """
autoencoder.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FLUX VAE decoder β€” param names match argmaxinc ae.safetensors keys.
2
+
3
+ Weight key structure:
4
+ decoder.conv_in.*
5
+ decoder.mid.block_{1,2}.{norm1,conv1,norm2,conv2}.*
6
+ decoder.mid.attn_1.{norm,q,k,v,proj_out}.*
7
+ decoder.up.{0-3}.block.{0-2}.{norm1,conv1,norm2,conv2,nin_shortcut}.*
8
+ decoder.up.{1-3}.upsample.conv.*
9
+ decoder.norm_out.*
10
+ decoder.conv_out.*
11
+
12
+ Note: up blocks are indexed in reverse β€” up.3 is the first decoder stage
13
+ (highest channels), up.0 is the last (lowest channels).
14
+
15
+ All conv weights loaded as PyTorch [O,I,kH,kW] are transposed to MLX
16
+ [O,kH,kW,I] in the pipeline's _load_vae().
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import mlx.core as mx
22
+ import mlx.nn as nn
23
+
24
+
25
+ # ── Building blocks (param names match weight keys) ──────────────────────────
26
+
27
class ResnetBlock(nn.Module):
    """Residual conv block; param names mirror the weight keys
    block_{i}.{norm1,conv1,norm2,conv2,nin_shortcut}.*"""

    def __init__(self, in_ch: int, out_ch: int):
        super().__init__()
        self.norm1 = nn.GroupNorm(32, in_ch)
        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1)
        self.norm2 = nn.GroupNorm(32, out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1)
        # 1x1 projection on the skip path only when channels change.
        self.nin_shortcut = (
            nn.Conv2d(in_ch, out_ch, kernel_size=1) if in_ch != out_ch else None
        )

    def __call__(self, x):
        skip = x if self.nin_shortcut is None else self.nin_shortcut(x)
        out = self.conv1(nn.silu(self.norm1(x)))
        out = self.conv2(nn.silu(self.norm2(out)))
        return skip + out
49
+
50
+
51
class AttnBlock(nn.Module):
    """Single-head spatial self-attention over the H*W positions.

    Matches weight keys: attn_1.{norm,q,k,v,proj_out}.*
    Q/K/V/O are 1x1 Conv2d layers (matching the stored weight shapes).
    """

    def __init__(self, channels: int):
        super().__init__()
        self.norm = nn.GroupNorm(32, channels)
        self.q = nn.Conv2d(channels, channels, kernel_size=1)
        self.k = nn.Conv2d(channels, channels, kernel_size=1)
        self.v = nn.Conv2d(channels, channels, kernel_size=1)
        self.proj_out = nn.Conv2d(channels, channels, kernel_size=1)

    def __call__(self, x):
        B, H, W, C = x.shape
        n = H * W
        normed = self.norm(x)
        queries = self.q(normed).reshape(B, n, C)
        keys = self.k(normed).reshape(B, n, C)
        values = self.v(normed).reshape(B, n, C)

        # Scaled dot-product attention over flattened spatial positions.
        logits = (queries @ keys.transpose(0, 2, 1)) * (C ** -0.5)
        probs = mx.softmax(logits, axis=-1)
        attended = (probs @ values).reshape(B, H, W, C)
        return x + self.proj_out(attended)
77
+
78
+
79
class Upsample(nn.Module):
    """2x nearest-neighbor upsample followed by a 3x3 conv.

    Matches weight keys: upsample.conv.*
    """

    def __init__(self, channels: int):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def __call__(self, x):
        # Nearest-neighbor 2x upsample in NHWC layout: duplicate each row
        # (axis=1) and each column (axis=2). The previous unused
        # `B, H, W, C = x.shape` unpack has been removed.
        x = mx.repeat(x, 2, axis=1)
        x = mx.repeat(x, 2, axis=2)
        return self.conv(x)
91
+
92
+
93
class UpBlock(nn.Module):
    """One decoder up-stage: N resnet blocks plus an optional 2x upsample.

    Matches weight keys: up.{i}.block.{0-2}.* and up.{i}.upsample.*
    """

    def __init__(self, in_ch: int, out_ch: int, num_blocks: int = 3, has_upsample: bool = True):
        super().__init__()
        blocks = []
        for j in range(num_blocks):
            # Only the first resnet changes the channel count.
            blocks.append(ResnetBlock(in_ch if j == 0 else out_ch, out_ch))
        self.block = blocks
        self.upsample = Upsample(out_ch) if has_upsample else None

    def __call__(self, x):
        for resnet in self.block:
            x = resnet(x)
        return x if self.upsample is None else self.upsample(x)
110
+
111
+
112
class MidBlock(nn.Module):
    """Decoder middle stage: resnet -> attention -> resnet.

    Matches weight keys: mid.{block_1, attn_1, block_2}.*
    """

    def __init__(self, channels: int):
        super().__init__()
        self.block_1 = ResnetBlock(channels, channels)
        self.attn_1 = AttnBlock(channels)
        self.block_2 = ResnetBlock(channels, channels)

    def __call__(self, x):
        return self.block_2(self.attn_1(self.block_1(x)))
126
+
127
+
128
+ # ── Decoder ──────────────────────────────────────────────────────────────────
129
+
130
class Decoder(nn.Module):
    """VAE Decoder. Param paths match: decoder.{conv_in,mid,up,norm_out,conv_out}.*

    Up block order (matching weight keys):
        up.3 -> 512->512 + upsample   (first stage executed)
        up.2 -> 512->512 + upsample
        up.1 -> 512->256 + upsample
        up.0 -> 256->128              (no upsample, last stage executed)
    """

    def __init__(self):
        super().__init__()
        # 16 latent channels in, 512 feature channels at the deepest stage.
        self.conv_in = nn.Conv2d(16, 512, kernel_size=3, padding=1)

        self.mid = MidBlock(512)

        # up blocks — indexed 0-3 to mirror the weight keys, but executed in
        # reverse (3→2→1→0); the list is declared lowest-channel-count first
        # so that self.up[i] lines up with the `decoder.up.{i}` key prefix.
        self.up = [
            UpBlock(256, 128, num_blocks=3, has_upsample=False),  # up.0
            UpBlock(512, 256, num_blocks=3, has_upsample=True),   # up.1
            UpBlock(512, 512, num_blocks=3, has_upsample=True),   # up.2
            UpBlock(512, 512, num_blocks=3, has_upsample=True),   # up.3
        ]

        self.norm_out = nn.GroupNorm(32, 128)
        self.conv_out = nn.Conv2d(128, 3, kernel_size=3, padding=1)

    def __call__(self, z):
        # z: latent tensor, channels-last (see AutoencoderKL docstring).
        h = self.conv_in(z)
        h = self.mid(h)
        # Process up blocks in reverse order: 3, 2, 1, 0
        for i in reversed(range(len(self.up))):
            h = self.up[i](h)
        h = nn.silu(self.norm_out(h))
        h = self.conv_out(h)
        return h
166
+
167
+
168
+ # ── AutoencoderKL ────────────────────────────────────────────────────────────
169
+
170
class AutoencoderKL(nn.Module):
    """FLUX VAE — decode-only path.

    Input:  z     [B, H/8, W/8, 16]  (latent, channels-last)
    Output: image [B, H, W, 3]       (RGB in [0, 1])
    """

    # FLUX latent scaling constants, undone before decoding.
    SCALE_FACTOR = 0.3611
    SHIFT_FACTOR = 0.1159

    def __init__(self):
        super().__init__()
        self.decoder = Decoder()

    def decode(self, z: mx.array) -> mx.array:
        # Undo latent scaling, decode, then map [-1, 1] output into [0, 1].
        latents = z / self.SCALE_FACTOR + self.SHIFT_FACTOR
        decoded = self.decoder(latents)
        return mx.clip((decoded + 1.0) / 2.0, 0.0, 1.0)
clip_encoder.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLIP-L text encoder for FLUX pipeline.
2
+
3
+ Implements a 23-layer CLIP text encoder with absolute position embeddings
4
+ and causal self-attention β€” matching the HuggingFace
5
+ ``openai/clip-vit-large-patch14`` architecture used by FLUX.1.
6
+
7
+ Weight source: ``black-forest-labs/FLUX.1-schnell`` β†’
8
+ ``text_encoder/model.safetensors``
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import math
14
+
15
+ import mlx.core as mx
16
+ import mlx.nn as nn
17
+
18
+
19
+ # ── CLIP Config ──────────────────────────────────────────────────────────────
20
+
21
class CLIPConfig:
    """Static configuration for the CLIP-L text encoder used by FLUX.1."""

    vocab_size: int = 49408
    d_model: int = 768
    num_heads: int = 12
    head_dim: int = 64  # d_model / num_heads
    intermediate_size: int = 3072
    num_layers: int = 23  # FLUX uses 23, not 12
    max_position_embeddings: int = 77
29
+
30
+
31
+ # ── Building blocks ──────────────────────────────────────────────────────────
32
+
33
class CLIPAttention(nn.Module):
    """CLIP multi-head self-attention with an optional additive mask."""

    def __init__(self, cfg: CLIPConfig):
        super().__init__()
        self.num_heads = cfg.num_heads
        self.head_dim = cfg.head_dim
        width = cfg.d_model

        self.q_proj = nn.Linear(width, width)
        self.k_proj = nn.Linear(width, width)
        self.v_proj = nn.Linear(width, width)
        self.out_proj = nn.Linear(width, width)

    def __call__(self, x: mx.array, causal_mask: mx.array | None = None) -> mx.array:
        batch, seq, _ = x.shape
        heads, hdim = self.num_heads, self.head_dim

        # Project then split heads: [B, L, d] -> [B, H, L, D]
        def split_heads(t):
            return t.reshape(batch, seq, heads, hdim).transpose(0, 2, 1, 3)

        q = split_heads(self.q_proj(x))
        k = split_heads(self.k_proj(x))
        v = split_heads(self.v_proj(x))

        scores = (q @ k.transpose(0, 1, 3, 2)) / math.sqrt(hdim)
        if causal_mask is not None:
            scores = scores + causal_mask

        probs = mx.softmax(scores, axis=-1)
        merged = (probs @ v).transpose(0, 2, 1, 3).reshape(batch, seq, -1)
        return self.out_proj(merged)
65
+
66
+
67
class CLIPMLP(nn.Module):
    """CLIP feed-forward network (GELU activation).

    NOTE(review): HF's openai/clip-vit-large-patch14 config specifies the
    "quick_gelu" activation (x * sigmoid(1.702 * x)); this block uses MLX's
    tanh-based gelu_approx instead — confirm against reference outputs.
    """

    def __init__(self, cfg: CLIPConfig):
        super().__init__()
        self.fc1 = nn.Linear(cfg.d_model, cfg.intermediate_size)
        self.fc2 = nn.Linear(cfg.intermediate_size, cfg.d_model)

    def __call__(self, x: mx.array) -> mx.array:
        return self.fc2(nn.gelu_approx(self.fc1(x)))
77
+
78
+
79
class CLIPEncoderLayer(nn.Module):
    """Single pre-norm CLIP encoder layer: Norm → Attention → Norm → MLP."""

    def __init__(self, cfg: CLIPConfig):
        super().__init__()
        self.norm1 = nn.LayerNorm(cfg.d_model)
        self.attn = CLIPAttention(cfg)
        self.norm2 = nn.LayerNorm(cfg.d_model)
        self.mlp = CLIPMLP(cfg)

    def __call__(self, x: mx.array, causal_mask: mx.array | None = None) -> mx.array:
        attn_out = self.attn(self.norm1(x), causal_mask)
        x = x + attn_out
        return x + self.mlp(self.norm2(x))
93
+
94
+
95
+ # ── CLIP Encoder ─────────────────────────────────────────────────────────────
96
+
97
class CLIPEncoder(nn.Module):
    """CLIP-L text encoder: 23-layer transformer with absolute position embeddings.

    Input:  token_ids [B, 77]
    Output: (pooled [B, 768], hidden_states [B, 77, 768])

    The pooled output is taken from the first EOS token position,
    following the CLIP convention.
    """

    def __init__(self, cfg: CLIPConfig | None = None):
        super().__init__()
        if cfg is None:
            cfg = CLIPConfig()
        self.cfg = cfg

        self.token_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Embedding(cfg.max_position_embeddings, cfg.d_model)
        self.layers = [CLIPEncoderLayer(cfg) for _ in range(cfg.num_layers)]
        self.final_norm = nn.LayerNorm(cfg.d_model)

    def _build_causal_mask(self, seq_len: int) -> mx.array:
        """Build additive causal attention mask [1, 1, L, L]."""
        mask = mx.full((seq_len, seq_len), -1e9)
        mask = mx.triu(mask, k=1)  # upper triangle = -1e9, diagonal+below = 0
        return mask.reshape(1, 1, seq_len, seq_len)

    def __call__(self, token_ids: mx.array) -> tuple[mx.array, mx.array]:
        B, L = token_ids.shape

        # Token + absolute position embeddings.
        positions = mx.arange(L)
        x = self.token_emb(token_ids) + self.pos_emb(positions)

        causal_mask = self._build_causal_mask(L)

        for layer in self.layers:
            x = layer(x, causal_mask)

        x = self.final_norm(x)  # [B, L, d_model]

        # Pooled output: position of the FIRST EOS token (id 49407).
        # argmax over the 0/1 mask returns the first match; if no EOS is
        # present it falls back to position 0. (The dead `has_eos` flag
        # that was computed but never used has been removed.)
        eos_id = 49407
        eos_mask = (token_ids == eos_id).astype(mx.int32)
        eos_pos = mx.argmax(eos_mask, axis=-1)  # [B]

        # Gather the hidden state at each batch element's EOS position.
        idx = eos_pos.reshape(B, 1, 1)
        idx = mx.broadcast_to(idx, (B, 1, x.shape[-1]))
        pooled = mx.take_along_axis(x, idx, axis=1).squeeze(1)  # [B, d_model]

        return pooled, x
download_weights.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Download shared T5/CLIP weights for the self-built FLUX MLX pipeline.
3
+
4
+ Stores files locally in backends/mlx_flux/weights/ so they are co-located
5
+ with our code and won't be accidentally deleted as "unused HF cache".
6
+ """
7
+ import os
8
+ import shutil
9
+ from pathlib import Path
10
+ from huggingface_hub import hf_hub_download
11
+
12
+ REPO = "black-forest-labs/FLUX.1-schnell"
13
+ WEIGHTS_DIR = Path(__file__).parent / "weights"
14
+
15
+ FILES = [
16
+ # (HF repo path, local filename)
17
+ ("text_encoder_2/model-00001-of-00002.safetensors", "t5_shard1.safetensors"),
18
+ ("text_encoder_2/model-00002-of-00002.safetensors", "t5_shard2.safetensors"),
19
+ ("text_encoder/model.safetensors", "clip_text_encoder.safetensors"),
20
+ ("tokenizer_2/spiece.model", "t5_spiece.model"),
21
+ ("tokenizer/vocab.json", "clip_vocab.json"),
22
+ ("tokenizer/merges.txt", "clip_merges.txt"),
23
+ ]
24
+
25
+
26
def main():
    """Download each entry in FILES into WEIGHTS_DIR, skipping files that
    already exist with non-zero size so the script is safe to re-run."""
    WEIGHTS_DIR.mkdir(exist_ok=True)
    for hf_path, local_name in FILES:
        dest = WEIGHTS_DIR / local_name
        # Non-zero-size check guards against a partially-written copy
        # left behind by an interrupted earlier run.
        if dest.exists() and dest.stat().st_size > 0:
            print(f" SKIP {local_name} (already exists, {dest.stat().st_size / 1024 / 1024:.1f} MB)")
            continue
        print(f" DOWNLOADING {hf_path} -> {local_name} ...")
        # hf_hub_download fetches into the shared HF cache; copy2 then
        # materializes a stable local copy next to the code.
        cached = hf_hub_download(REPO, hf_path)
        shutil.copy2(cached, dest)
        print(f" OK {local_name} ({dest.stat().st_size / 1024 / 1024:.1f} MB)")
    print("\nAll weights ready in", WEIGHTS_DIR)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
flux_model.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FLUX DiT β€” rewritten to match mflux reference implementation exactly.
2
+
3
+ Parameter names match argmaxinc/mlx-FLUX.1-schnell weights.
4
+ Forward pass logic matches filipstrand/mflux.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+
11
+ import mlx.core as mx
12
+ import mlx.nn as nn
13
+
14
+
15
+ # ── Config ───────────────────────────────────────────────────────────────────
16
+
17
class FluxConfig:
    """Static FLUX.1-schnell DiT hyperparameters."""

    hidden_size: int = 3072
    num_heads: int = 24
    head_dim: int = 128
    mlp_ratio: float = 4.0
    num_joint_blocks: int = 19
    num_single_blocks: int = 38
    # Per-axis RoPE dims for (time, height, width); they sum to head_dim.
    axes_dim: tuple[int, ...] = (16, 56, 56)
    theta: float = 10_000.0
    in_channels: int = 64
    context_dim: int = 4096  # T5-XXL hidden size
    pooled_dim: int = 768    # CLIP-L pooled size
29
+
30
+
31
+ # ── RoPE (matches mflux EmbedND) ────────────────────────────────────────────
32
+
33
def _rope_single_axis(pos: mx.array, dim: int, theta: float) -> mx.array:
    """Compute RoPE rotation matrices for one positional axis.

    Args:
        pos: [B, seq] positions along this axis.
        dim: number of embedding dims allotted to this axis (even).
        theta: RoPE frequency base.

    Returns:
        [B, seq, dim//2, 2, 2] rotation matrices.
    """
    exponents = mx.arange(0, dim, 2, dtype=mx.float32) / dim
    inv_freq = 1.0 / (theta ** exponents)  # [dim//2]
    angles = pos[:, :, None].astype(mx.float32) * inv_freq[None, :]  # [B, seq, dim//2]
    c = mx.cos(angles)
    s = mx.sin(angles)
    # Row-major 2x2 rotation matrix [[cos, -sin], [sin, cos]] per angle.
    rot = mx.stack([c, -s, s, c], axis=-1)
    return rot.reshape(pos.shape[0], -1, dim // 2, 2, 2)
47
+
48
+
49
def compute_rope(ids: mx.array, axes_dim=(16, 56, 56), theta=10000.0) -> mx.array:
    """Compute N-dimensional RoPE embeddings.

    Args:
        ids: [B, seq, 3] position IDs (time, height, width)

    Returns:
        [B, 1, seq, head_dim//2, 2, 2] rotation matrices
    """
    per_axis = [
        _rope_single_axis(ids[..., axis], axes_dim[axis], theta)
        for axis in range(3)
    ]
    # Concatenate along the dim//2 axis → total = sum(axes_dim)//2 = 64.
    emb = mx.concatenate(per_axis, axis=-3)
    # Insert a broadcast head axis: [B, 1, seq, 64, 2, 2].
    return emb[:, None]
63
+
64
+
65
def apply_rope(q: mx.array, k: mx.array, freqs: mx.array):
    """Apply rotary embeddings to q and k (matches mflux exactly).

    Args:
        q, k: [B, H, L, D] where D = head_dim
        freqs: [B, 1, L, D//2, 2, 2] rotation matrices from compute_rope()

    Returns:
        (q_rotated, k_rotated), both [B, H, L, D] cast to float32.
    """
    # Reshape to pairs: [B, H, L, D//2, 1, 2]
    xq_ = q.astype(mx.float32).reshape(*q.shape[:-1], -1, 1, 2)
    xk_ = k.astype(mx.float32).reshape(*k.shape[:-1], -1, 1, 2)

    # freqs[..., 0] = [[cos, -sin]]  shape [..., 2]
    # freqs[..., 1] = [[sin,  cos]]  shape [..., 2]
    # xq_[..., 0] = first of pair (scalar), xq_[..., 1] = second of pair,
    # so each line below is one row of the 2x2 rotation applied per pair.
    xq_out = freqs[..., 0] * xq_[..., 0] + freqs[..., 1] * xq_[..., 1]
    xk_out = freqs[..., 0] * xk_[..., 0] + freqs[..., 1] * xk_[..., 1]

    return xq_out.reshape(*q.shape).astype(mx.float32), xk_out.reshape(*k.shape).astype(mx.float32)
82
+
83
+
84
+ # ── Timestep embedding (matches mflux TimeTextEmbed) ─────────────────────────
85
+
86
def timestep_embedding(t: mx.array, dim: int = 256) -> mx.array:
    """Sinusoidal timestep embedding with half-freq swap (mflux convention).

    Args:
        t: [B] timesteps.
        dim: embedding width (even).

    Returns:
        [B, dim] embedding, halves swapped from the usual [sin, cos] order.
    """
    half = dim // 2
    inv_freq = mx.exp(-math.log(10000.0) * mx.arange(half, dtype=mx.float32) / half)
    phase = t[:, None].astype(mx.float32) * inv_freq[None, :]
    sin_cos = mx.concatenate([mx.sin(phase), mx.cos(phase)], axis=-1)
    # mflux swaps the halves: [sin, cos] → [cos, sin].
    return mx.concatenate([sin_cos[:, half:], sin_cos[:, :half]], axis=-1)
98
+
99
+
100
+ # ── AdaLN modulation (matches mflux AdaLayerNormZero) ────────────────────────
101
+
102
class AdaLNModulation(nn.Module):
    """SiLU → Linear producing n_params concatenated modulation vectors.

    Matches weight keys: adaLN_modulation.layers.1.*
    (layers.0 is a parameter-free SiLU; layers.1 is the Linear.)
    """

    def __init__(self, dim: int, n_params: int):
        super().__init__()
        self.layers = [nn.SiLU(), nn.Linear(dim, n_params * dim)]

    def __call__(self, x: mx.array) -> mx.array:
        silu, linear = self.layers
        return linear(silu(x))
115
+
116
+
117
+ # ── QK Norm ──────────────────────────────────────────────────────────────────
118
+
119
class QKNorm(nn.Module):
    """Per-head RMSNorm applied to queries and keys before attention.

    Matches weight keys: qk_norm.{q_norm, k_norm}.weight
    """
    def __init__(self, dim: int):
        super().__init__()
        self.q_norm = nn.RMSNorm(dim)
        self.k_norm = nn.RMSNorm(dim)
125
+
126
+
127
+ # ── Attention ────────────────────────────────────────────────────────────────
128
+
129
class Attention(nn.Module):
    """Projection container for attention: separate Q/K/V/O linears.

    Matches weight keys: attn.{q,k,v,o}_proj.*
    The attention math itself lives in the transformer blocks.
    """

    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        for proj_name in ("q_proj", "k_proj", "v_proj", "o_proj"):
            setattr(self, proj_name, nn.Linear(dim, dim))
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
139
+
140
+
141
+ # ── MLP ──────────────────────────────────────────────────────────────────────
142
+
143
class MLP(nn.Module):
    """Two-layer GELU feed-forward. Matches weight keys: mlp.{fc1,fc2}.*"""

    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, dim)

    def __call__(self, x: mx.array) -> mx.array:
        activated = nn.gelu(self.fc1(x))
        return self.fc2(activated)
152
+
153
+
154
+ # ── Joint Transformer Block ─────────────────────────────────────────────────
155
+
156
class ImageTransformerSubBlock(nn.Module):
    """Image side of a joint block (parameter container only; the forward
    pass lives in JointTransformerBlock).

    Matches weight keys:
        image_transformer_block.{adaLN_modulation, attn, mlp, qk_norm}.*
    """
    def __init__(self, cfg: FluxConfig):
        super().__init__()
        H = cfg.hidden_size
        self.adaLN_modulation = AdaLNModulation(H, 6)  # shift/scale/gate x2
        self.attn = Attention(H, cfg.num_heads)
        self.mlp = MLP(H, int(H * cfg.mlp_ratio))
        self.qk_norm = QKNorm(cfg.head_dim)
167
+
168
+
169
class TextTransformerSubBlock(nn.Module):
    """Text side of a joint block (parameter container only; the forward
    pass lives in JointTransformerBlock).

    Matches weight keys:
        text_transformer_block.{adaLN_modulation, attn, mlp, qk_norm}.*
    """
    def __init__(self, cfg: FluxConfig):
        super().__init__()
        H = cfg.hidden_size
        self.adaLN_modulation = AdaLNModulation(H, 6)  # shift/scale/gate x2
        self.attn = Attention(H, cfg.num_heads)
        # Same FFN width as the image side: int(H * mlp_ratio). (An earlier
        # comment claimed hidden_size/2 = 1536 here, contradicting the code.)
        self.mlp = MLP(H, int(H * cfg.mlp_ratio))
        self.qk_norm = QKNorm(cfg.head_dim)
182
+
183
+
184
class JointTransformerBlock(nn.Module):
    """Dual-stream DiT block. Matches: multimodal_transformer_blocks.{i}.*

    Text and image tokens keep separate parameters (modulation, QKV/O, FFN)
    but attend jointly over the concatenated [txt, img] sequence.
    """

    def __init__(self, cfg: FluxConfig):
        super().__init__()
        self.image_transformer_block = ImageTransformerSubBlock(cfg)
        self.text_transformer_block = TextTransformerSubBlock(cfg)
        self._num_heads = cfg.num_heads
        self._head_dim = cfg.head_dim

    def __call__(self, img, txt, vec, rope_emb):
        B = img.shape[0]
        H, D = self._num_heads, self._head_dim
        img_len = img.shape[1]
        txt_len = txt.shape[1]

        # 1. AdaLN modulation: 6 vectors per stream —
        #    (shift1, scale1, gate1) for attention, (shift2, scale2, gate2) for FFN.
        img_params = self.image_transformer_block.adaLN_modulation(vec)
        i_s1, i_sc1, i_g1, i_s2, i_sc2, i_g2 = mx.split(img_params, 6, axis=-1)

        txt_params = self.text_transformer_block.adaLN_modulation(vec)
        t_s1, t_sc1, t_g1, t_s2, t_sc2, t_g2 = mx.split(txt_params, 6, axis=-1)

        # 2. LayerNorm(affine=False) + modulate. affine=False means the
        #    inline-constructed LayerNorm holds no learned parameters.
        img_norm = nn.LayerNorm(img.shape[-1], affine=False, eps=1e-6)(img)
        img_norm = img_norm * (1 + i_sc1[:, None, :]) + i_s1[:, None, :]
        txt_norm = nn.LayerNorm(txt.shape[-1], affine=False, eps=1e-6)(txt)
        txt_norm = txt_norm * (1 + t_sc1[:, None, :]) + t_s1[:, None, :]

        # 3. Q/K/V projections + per-head QK RMSNorm
        img_q = self.image_transformer_block.attn.q_proj(img_norm).reshape(B, img_len, H, D)
        img_k = self.image_transformer_block.attn.k_proj(img_norm).reshape(B, img_len, H, D)
        img_v = self.image_transformer_block.attn.v_proj(img_norm).reshape(B, img_len, H, D)
        txt_q = self.text_transformer_block.attn.q_proj(txt_norm).reshape(B, txt_len, H, D)
        txt_k = self.text_transformer_block.attn.k_proj(txt_norm).reshape(B, txt_len, H, D)
        txt_v = self.text_transformer_block.attn.v_proj(txt_norm).reshape(B, txt_len, H, D)

        img_q = self.image_transformer_block.qk_norm.q_norm(img_q)
        img_k = self.image_transformer_block.qk_norm.k_norm(img_k)
        txt_q = self.text_transformer_block.qk_norm.q_norm(txt_q)
        txt_k = self.text_transformer_block.qk_norm.k_norm(txt_k)

        # 4. Concat for joint attention, text first — matching the
        #    rope_emb ordering built in FluxTransformer.__call__.
        q = mx.concatenate([txt_q, img_q], axis=1).transpose(0, 2, 1, 3)  # [B,H,L,D]
        k = mx.concatenate([txt_k, img_k], axis=1).transpose(0, 2, 1, 3)
        v = mx.concatenate([txt_v, img_v], axis=1).transpose(0, 2, 1, 3)

        # 5. RoPE
        q, k = apply_rope(q, k, rope_emb)

        # 6. Scaled dot-product attention over the full joint sequence
        scale = math.sqrt(D)
        scores = (q @ k.transpose(0, 1, 3, 2)) / scale
        weights = mx.softmax(scores, axis=-1)
        attn_out = (weights @ v).transpose(0, 2, 1, 3).reshape(B, txt_len + img_len, -1)

        # 7. Split back into text and image parts
        txt_attn = attn_out[:, :txt_len, :]
        img_attn = attn_out[:, txt_len:, :]

        # 8. Output projection + gated residual for the attention branch
        img_attn = self.image_transformer_block.attn.o_proj(img_attn)
        txt_attn = self.text_transformer_block.attn.o_proj(txt_attn)
        img = img + i_g1[:, None, :] * img_attn
        txt = txt + t_g1[:, None, :] * txt_attn

        # 9. FFN with its own LayerNorm(affine=False) + modulate + gate
        img_ff_in = nn.LayerNorm(img.shape[-1], affine=False, eps=1e-6)(img)
        img_ff_in = img_ff_in * (1 + i_sc2[:, None, :]) + i_s2[:, None, :]
        img = img + i_g2[:, None, :] * self.image_transformer_block.mlp(img_ff_in)

        txt_ff_in = nn.LayerNorm(txt.shape[-1], affine=False, eps=1e-6)(txt)
        txt_ff_in = txt_ff_in * (1 + t_sc2[:, None, :]) + t_s2[:, None, :]
        txt = txt + t_g2[:, None, :] * self.text_transformer_block.mlp(txt_ff_in)

        return img, txt
260
+
261
+
262
+ # ── Single Transformer Block ────────────────────────────────────────────────
263
+
264
class SingleTransformerSubBlock(nn.Module):
    """Single-stream DiT block: AdaLN → parallel (attention | MLP) → gated sum.

    Matches weight keys: unified_transformer_blocks.{i}.transformer_block.*
    """

    def __init__(self, cfg: FluxConfig):
        super().__init__()
        H = cfg.hidden_size
        mlp_hidden = int(H * cfg.mlp_ratio)
        self.adaLN_modulation = AdaLNModulation(H, 3)  # shift, scale, gate
        self.attn = Attention(H, cfg.num_heads)
        self.mlp = MLP(H, mlp_hidden)
        self.qk_norm = QKNorm(cfg.head_dim)

    def __call__(self, x, vec, rope_emb):
        B, L, D = x.shape
        H, HD = self._get_heads()
        residual = x

        # 1. AdaLN: parameter-free LayerNorm modulated by (shift, scale).
        params = self.adaLN_modulation(vec)
        shift, scale, gate = mx.split(params, 3, axis=-1)
        x_norm = nn.LayerNorm(D, affine=False, eps=1e-6)(x)
        x_norm = x_norm * (1 + scale[:, None, :]) + shift[:, None, :]

        # 2. Attention with per-head QK RMSNorm and RoPE
        q = self.attn.q_proj(x_norm).reshape(B, L, H, HD)
        k = self.attn.k_proj(x_norm).reshape(B, L, H, HD)
        v = self.attn.v_proj(x_norm).reshape(B, L, H, HD)

        q = self.qk_norm.q_norm(q)
        k = self.qk_norm.k_norm(k)

        q = q.transpose(0, 2, 1, 3)
        k = k.transpose(0, 2, 1, 3)
        v = v.transpose(0, 2, 1, 3)

        q, k = apply_rope(q, k, rope_emb)

        sc = math.sqrt(HD)
        scores = (q @ k.transpose(0, 1, 3, 2)) / sc
        w = mx.softmax(scores, axis=-1)
        attn_out = (w @ v).transpose(0, 2, 1, 3).reshape(B, L, -1)

        # 3. Parallel MLP branch, sharing x_norm with the attention branch
        mlp_out = nn.gelu_approx(self.mlp.fc1(x_norm))

        # 4. FLUX's fused proj_out(concat(attn, mlp)) is equivalent to
        #    o_proj(attn_out) + fc2(mlp_out) with the weights split across
        #    these two linears, then a single gate on the sum. (A dead
        #    mx.concatenate of the two branches was removed here.)
        attn_projected = self.attn.o_proj(attn_out)
        mlp_projected = self.mlp.fc2(mlp_out)
        out = gate[:, None, :] * (attn_projected + mlp_projected)

        return residual + out

    def _get_heads(self):
        # Derive from the attention module instead of hard-coding (24, 128)
        # so a non-default FluxConfig stays consistent.
        return self.attn.num_heads, self.attn.head_dim
322
+
323
+
324
class SingleTransformerBlock(nn.Module):
    """Thin wrapper so parameters land under the key path:
    unified_transformer_blocks.{i}.transformer_block.*
    """

    def __init__(self, cfg: FluxConfig):
        super().__init__()
        self.transformer_block = SingleTransformerSubBlock(cfg)

    def __call__(self, x, vec, rope_emb):
        inner = self.transformer_block
        return inner(x, vec, rope_emb)
332
+
333
+
334
+ # ── FLUX Transformer ────────────────────────────────────────────────────────
335
+
336
class FluxTransformer(nn.Module):
    """FLUX DiT matching argmaxinc weights + mflux forward pass.

    __call__ inputs:
        img:       [B, img_seq, 64] patchified latents
        img_ids:   [B, img_seq, 3] positional IDs for RoPE
        txt:       [B, txt_seq, 4096] T5 hidden states
        txt_ids:   [B, txt_seq, 3] positional IDs for the text tokens
        y:         [B, 768] CLIP pooled embedding
        timesteps: [B] scheduler timesteps

    Returns: [B, img_seq, 64] model output in patch space.
    """

    def __init__(self, cfg: FluxConfig | None = None):
        super().__init__()
        if cfg is None:
            cfg = FluxConfig()
        self.cfg = cfg
        H = cfg.hidden_size

        # x_embedder: Linear (NOT Conv2d) — matches mflux
        self.x_embedder = nn.Linear(cfg.in_channels, H)

        # context_embedder: projects T5 hidden states to the DiT width
        self.context_embedder = nn.Linear(cfg.context_dim, H)

        # Timestep + pooled-text embeddings (match mflux naming)
        self.t_embedder = _TimestepEmbedder(H)
        self.y_embedder = _TextEmbedder(cfg.pooled_dim, H)

        # Transformer blocks
        self.multimodal_transformer_blocks = [
            JointTransformerBlock(cfg) for _ in range(cfg.num_joint_blocks)
        ]
        self.unified_transformer_blocks = [
            SingleTransformerBlock(cfg) for _ in range(cfg.num_single_blocks)
        ]

        # Final layer (matches mflux AdaLayerNormContinuous)
        self.final_layer = _FinalLayer(H, cfg.in_channels)

    def __call__(self, img, img_ids, txt, txt_ids, y, timesteps):
        # 1. Embeddings
        img = self.x_embedder(img)        # [B, seq, 64] → [B, seq, 3072]
        txt = self.context_embedder(txt)  # [B, seq, 4096] → [B, seq, 3072]

        # 2. Conditioning vector — timesteps already scaled to [0, 1000]
        #    by the scheduler; combined with the CLIP pooled embedding.
        t_emb = timestep_embedding(timesteps, 256)
        vec = self.t_embedder(t_emb) + self.y_embedder(y)

        # 3. RoPE for the full joint sequence, text tokens first — the same
        #    ordering the joint blocks use when concatenating Q/K/V.
        all_ids = mx.concatenate([txt_ids, img_ids], axis=1)
        rope_emb = compute_rope(all_ids, self.cfg.axes_dim, self.cfg.theta)

        # 4. Dual-stream joint blocks
        for block in self.multimodal_transformer_blocks:
            img, txt = block(img, txt, vec, rope_emb)

        # 5. Merge streams for the single blocks: [txt, img]
        img = mx.concatenate([txt, img], axis=1)

        # 6. Single-stream blocks (rope_emb covers the full sequence)
        for block in self.unified_transformer_blocks:
            img = block(img, vec, rope_emb)

        # 7. Drop the text tokens — txt still holds the embedded text, so
        #    txt.shape[1] is the text length — then apply the final layer.
        img = img[:, txt.shape[1]:, :]
        img = self.final_layer(img, vec)

        return img
396
+
397
+
398
+ # ── Helper modules (match mflux TimestepEmbedder/TextEmbedder naming) ────────
399
+
400
class _TimestepEmbedder(nn.Module):
    """Projects the 256-dim sinusoidal timestep embedding to hidden width.

    Matches: t_embedder.mlp.layers.{0,2}.*
    """
    def __init__(self, dim):
        super().__init__()
        self.mlp = _MLP2(256, dim)

    def __call__(self, x):
        return self.mlp(x)
408
+
409
+
410
class _TextEmbedder(nn.Module):
    """Projects the CLIP pooled embedding to the DiT hidden width.

    Matches: y_embedder.mlp.layers.{0,2}.*
    """
    def __init__(self, in_dim, dim):
        super().__init__()
        self.mlp = _MLP2(in_dim, dim)

    def __call__(self, x):
        return self.mlp(x)
418
+
419
+
420
class _MLP2(nn.Module):
    """Two-layer MLP with SiLU. Matches weight keys: mlp.layers.{0,2}.*
    (layers.1, the SiLU, has no parameters.)
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.layers = [nn.Linear(in_dim, out_dim), nn.SiLU(), nn.Linear(out_dim, out_dim)]

    def __call__(self, x):
        first, act, second = self.layers
        return second(act(first(x)))
430
+
431
+
432
class _FinalLayer(nn.Module):
    """Matches: final_layer.{adaLN_modulation, linear}.*

    AdaLayerNormContinuous-style head: a parameter-free LayerNorm modulated
    by (scale, shift) derived from the conditioning vector, followed by a
    linear projection back to patch channels.
    """
    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.adaLN_modulation = AdaLNModulation(hidden_size, 2)
        self.linear = nn.Linear(hidden_size, out_channels)

    def __call__(self, x, vec):
        params = self.adaLN_modulation(vec)
        # Split order is (scale, shift) — matches diffusers'
        # AdaLayerNormContinuous chunking convention.
        scale, shift = mx.split(params, 2, axis=-1)
        x = nn.LayerNorm(x.shape[-1], affine=False, eps=1e-6)(x)
        x = x * (1 + scale[:, None, :]) + shift[:, None, :]
        return self.linear(x)
pipeline.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FLUX.1-schnell pipeline β€” end-to-end text-to-image on MLX.
2
+
3
+ Orchestrates the full inference chain:
4
+ 1. Tokenize prompt (T5 + CLIP)
5
+ 2. Encode text (T5 β†’ embeddings, CLIP β†’ pooled)
6
+ 3. Initialize latent noise + patchify
7
+ 4. Denoising loop (rectified flow, Euler steps)
8
+ 5. Unpatchify + VAE decode β†’ PIL.Image
9
+
10
+ Memory strategy: components are loaded/unloaded in phases so that
11
+ only one large model occupies unified memory at a time.
12
+
13
+ Usage::
14
+
15
+ pipe = FluxPipeline()
16
+ pipe.load("argmaxinc/mlx-FLUX.1-schnell-4bit-quantized")
17
+ img = pipe.generate("a cat in a garden", steps=4, width=512, height=512)
18
+ img.save("output.png")
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import logging
24
+ import os
25
+ import time
26
+ from pathlib import Path
27
+
28
+ import mlx.core as mx
29
+ import mlx.nn as nn
30
+ from huggingface_hub import hf_hub_download
31
+ from PIL import Image
32
+
33
+ from .autoencoder import AutoencoderKL
34
+ from .clip_encoder import CLIPEncoder
35
+ from .flux_model import FluxTransformer
36
+ from .sampler import FlowMatchEulerScheduler, compute_img_ids, patchify, unpatchify
37
+ from .t5_encoder import T5Encoder
38
+ from .tokenizers import CLIPTokenizer, T5Tokenizer
39
+ from . import weight_loader
40
+
41
+ logger = logging.getLogger("image-server")
42
+
43
+ # ── Model repos ──────────────────────────────────────────────────────────────
44
+
45
+ DEFAULT_MODEL = "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized"
46
+ FULL_MODEL = "argmaxinc/mlx-FLUX.1-schnell"
47
+ T5_CLIP_REPO = "black-forest-labs/FLUX.1-schnell"
48
+
49
+ # Local weights directory (co-located with pipeline code, won't be
50
+ # accidentally deleted when cleaning HF cache)
51
+ _WEIGHTS_DIR = Path(__file__).parent / "weights"
52
+
53
+
54
class FluxPipeline:
    """FLUX.1-schnell pipeline for MLX (Apple Silicon).

    Manages model loading, text encoding, denoising, and VAE decoding
    with phase-based memory management.
    """

    def __init__(self, model_id: str | None = None, quantize: bool = False):
        """Configure the pipeline; no weights are loaded until :meth:`load`.

        Args:
            model_id: HF repo id for the DiT weights (defaults to the
                4-bit quantized build).
            quantize: stored for callers; quantization is actually detected
                from the downloaded weights in ``_load_dit``.
        """
        self.model_id = model_id or DEFAULT_MODEL
        self.quantize = quantize

        # Performance options — uses the module-level `os` import; the
        # original re-imported os locally, shadowing it needlessly.
        self.use_compile = os.environ.get("IMAGE_MLX_COMPILE", "0") in ("1", "true", "yes")
        self.phased_memory = os.environ.get("IMAGE_MLX_PHASED_MEM", "1") not in ("0", "false", "no")

        # Components (lazily loaded by load(); set back to None by unload()
        # and by the phased-memory transitions inside generate()).
        self.t5_tokenizer: T5Tokenizer | None = None
        self.clip_tokenizer: CLIPTokenizer | None = None
        self.t5_encoder: T5Encoder | None = None
        self.clip_encoder: CLIPEncoder | None = None
        self.transformer: FluxTransformer | None = None
        self.vae: AutoencoderKL | None = None
        self.scheduler = FlowMatchEulerScheduler()

        self._loaded = False
80
+
81
+ # ── Loading ──────────────────────────────────────────────────────────
82
+
83
    def load(self, model_id: str | None = None) -> None:
        """Download and load all model components.

        Args:
            model_id: optional override of the repo id chosen at construction.

        Weight loading is best-effort per component: a failure in any one
        loader is logged and the component keeps its random initialization,
        so the pipeline still comes up (useful for smoke tests without
        network access). ``_loaded`` is set regardless.
        """
        repo_id = model_id or self.model_id
        t0 = time.time()
        logger.info("[MLX] Loading FLUX pipeline from %s ...", repo_id)

        # 1. Download files (DiT + VAE from the chosen repo; tokenizer assets
        #    from the local weights/ dir when present, else the BFL repo).
        dit_path = self._download_dit(repo_id)
        ae_path = hf_hub_download(repo_id, "ae.safetensors")
        t5_spiece = self._local_or_hf("t5_spiece.model", "tokenizer_2/spiece.model")
        clip_vocab = self._local_or_hf("clip_vocab.json", "tokenizer/vocab.json")
        clip_merges = self._local_or_hf("clip_merges.txt", "tokenizer/merges.txt")

        # T5 weights (multi-shard); empty list means "not available".
        t5_paths = self._download_t5_weights()

        # CLIP weights; None means "not available".
        clip_path = self._download_clip_weights()

        # 2. Init tokenizers (lengths match FLUX conventions: T5=256, CLIP=77)
        self.t5_tokenizer = T5Tokenizer(t5_spiece, max_length=256)
        self.clip_tokenizer = CLIPTokenizer(clip_vocab, clip_merges, max_length=77)

        # 3. Build models (random-initialized until weights land below)
        self.t5_encoder = T5Encoder()
        self.clip_encoder = CLIPEncoder()
        self.transformer = FluxTransformer()
        self.vae = AutoencoderKL()

        # 4. Load weights (each component independently — partial loading OK)
        if t5_paths:
            try:
                weight_loader.load_t5(t5_paths, self.t5_encoder)
            except Exception as exc:
                logger.warning("[MLX] T5 weight loading failed: %s — using random init", exc)
        if clip_path:
            try:
                weight_loader.load_clip(clip_path, self.clip_encoder)
            except Exception as exc:
                logger.warning("[MLX] CLIP weight loading failed: %s — using random init", exc)

        # DiT weights — load directly into transformer model
        try:
            self._load_dit(dit_path)
        except Exception as exc:
            logger.warning("[MLX] DiT weight loading failed: %s — using random init", exc)

        # VAE weights
        try:
            self._load_vae(ae_path)
        except Exception as exc:
            logger.warning("[MLX] VAE weight loading failed: %s — using random init", exc)

        self._loaded = True
        logger.info("[MLX] FLUX pipeline ready (%.1fs)", time.time() - t0)
138
+
139
+ def _download_dit(self, repo_id: str) -> str:
140
+ """Download DiT weights file."""
141
+ if "4bit" in repo_id:
142
+ return hf_hub_download(repo_id, "flux-schnell-4bit-quantized.safetensors")
143
+ return hf_hub_download(repo_id, "flux-schnell.safetensors")
144
+
145
+ def _download_t5_weights(self) -> list[str]:
146
+ """Get T5-XXL encoder weight paths β€” local weights/ dir first, HF fallback."""
147
+ local1 = _WEIGHTS_DIR / "t5_shard1.safetensors"
148
+ local2 = _WEIGHTS_DIR / "t5_shard2.safetensors"
149
+ if local1.exists() and local2.exists():
150
+ logger.info("[MLX] T5 weights ready (local)")
151
+ return [str(local1), str(local2)]
152
+ try:
153
+ p1 = hf_hub_download(T5_CLIP_REPO, "text_encoder_2/model-00001-of-00002.safetensors")
154
+ p2 = hf_hub_download(T5_CLIP_REPO, "text_encoder_2/model-00002-of-00002.safetensors")
155
+ logger.info("[MLX] T5 weights ready (HF cache)")
156
+ return [p1, p2]
157
+ except Exception as exc:
158
+ logger.warning("[MLX] T5 weights not available: %s β€” text encoding will be limited", exc)
159
+ return []
160
+
161
+ def _download_clip_weights(self) -> str | None:
162
+ """Get CLIP encoder weight path β€” local weights/ dir first, HF fallback."""
163
+ local = _WEIGHTS_DIR / "clip_text_encoder.safetensors"
164
+ if local.exists():
165
+ logger.info("[MLX] CLIP weights ready (local)")
166
+ return str(local)
167
+ try:
168
+ path = hf_hub_download(T5_CLIP_REPO, "text_encoder/model.safetensors")
169
+ logger.info("[MLX] CLIP weights ready (HF cache)")
170
+ return path
171
+ except Exception as exc:
172
+ logger.warning("[MLX] CLIP weights not available: %s", exc)
173
+ return None
174
+
175
+ @staticmethod
176
+ def _local_or_hf(local_name: str, hf_path: str) -> str:
177
+ """Return local path if exists, else download from HF."""
178
+ local = _WEIGHTS_DIR / local_name
179
+ if local.exists():
180
+ return str(local)
181
+ return hf_hub_download(T5_CLIP_REPO, hf_path)
182
+
183
    def _load_dit(self, path: str) -> None:
        """Load DiT weights into transformer.

        For 4-bit quantized models, quantizes the model's Linear layers
        to QuantizedLinear first so load_weights can accept uint32 triplets.

        Args:
            path: local safetensors file produced by ``_download_dit``.
        """
        logger.info("[MLX] Loading DiT weights from %s ...", os.path.basename(path))
        # Use weight_loader's robust safetensors reader (handles bfloat16)
        weights = weight_loader._load_safetensors(path)

        # Detect if quantized: check for `.scales` keys
        is_quantized = any(k.endswith(".scales") for k in weights)
        if is_quantized:
            logger.info("[MLX] Detected quantized weights — converting model layers")
            # Quantize Linear layers except x_embedder (its weight is float, not quantized)
            # NOTE(review): the predicate's first parameter shadows the method's
            # `path` argument — harmless here (nn.quantize passes the module path),
            # but worth renaming if this is touched again.
            # group_size=64/bits=4 assumed to match the argmaxinc checkpoint —
            # TODO confirm against the repo's quantization config.
            def _should_quantize(path, module):
                if "x_embedder" in path:
                    return False
                return isinstance(module, nn.Linear)
            nn.quantize(self.transformer, group_size=64, bits=4, class_predicate=_should_quantize)

        # Map x_embedder.proj.* → x_embedder.* (Conv2d weights → Linear)
        remapped = {}
        for k, v in weights.items():
            new_k = k
            if k.startswith("x_embedder.proj."):
                new_k = k.replace("x_embedder.proj.", "x_embedder.")
                # Squeeze conv dimensions: [out, 1, 1, in] → [out, in]
                if v.ndim == 4:
                    v = v.squeeze()
            remapped[new_k] = v

        # strict=False: tolerate keys that don't exist on this model build.
        pairs = list(remapped.items())
        self.transformer.load_weights(pairs, strict=False)
        logger.info("[MLX] DiT loaded: %d weight tensors (quantized=%s)", len(pairs), is_quantized)
218
+
219
+ def _load_vae(self, path: str) -> None:
220
+ """Load VAE weights. Transposes conv weights from PyTorch NCHW to MLX NHWC."""
221
+ logger.info("[MLX] Loading VAE weights from %s ...", os.path.basename(path))
222
+ weights = weight_loader._load_safetensors(path)
223
+
224
+ # Only keep decoder weights (skip encoder)
225
+ transposed = {}
226
+ n_transposed = 0
227
+ for k, v in weights.items():
228
+ if not k.startswith("decoder."):
229
+ continue
230
+ # Transpose conv weights: PyTorch [O, I, kH, kW] β†’ MLX [O, kH, kW, I]
231
+ if v.ndim == 4:
232
+ v = v.transpose(0, 2, 3, 1)
233
+ n_transposed += 1
234
+ transposed[k] = v
235
+
236
+ pairs = list(transposed.items())
237
+ self.vae.load_weights(pairs, strict=False)
238
+ logger.info("[MLX] VAE loaded: %d tensors (%d conv transposed)", len(pairs), n_transposed)
239
+
240
+ # ── Generation ───────────────────────────────────────────────────────
241
+
242
    def generate(
        self,
        prompt: str,
        *,
        width: int = 512,
        height: int = 512,
        steps: int = 4,
        seed: int | None = None,
        progress_callback=None,
    ) -> Image.Image:
        """Generate an image from a text prompt.

        Args:
            prompt: Text description.
            width: Image width (rounded down to multiple of 16).
            height: Image height (rounded down to multiple of 16).
            steps: Denoising steps (default 4 for schnell).
            seed: Random seed for reproducibility.
            progress_callback: fn(step, total_steps) called per step.

        Returns:
            PIL.Image.

        Raises:
            RuntimeError: if :meth:`load` has not been called.

        Note: with ``phased_memory`` enabled, the text encoders and the
        transformer are dropped mid-run; call :meth:`load` again before
        the next generation.
        """
        if not self._loaded:
            raise RuntimeError("Pipeline not loaded. Call load() first.")

        # Round to multiple of 16 (latents are /8, then 2x2-patched -> /16)
        width = (width // 16) * 16
        height = (height // 16) * 16

        if seed is not None:
            mx.random.seed(seed)

        # ── Phase 1: Text encoding ──────────────────────────────────────
        logger.info("[MLX] Phase 1: Text encoding...")
        t0 = time.time()

        t5_ids = self.t5_tokenizer.tokenize(prompt)
        clip_ids = self.clip_tokenizer.tokenize(prompt)

        t5_embed = self.t5_encoder(t5_ids)  # [1, 256, 4096]
        clip_pooled, _ = self.clip_encoder(clip_ids)  # [1, 768]
        # Force evaluation now so the encoders can be freed immediately after.
        mx.eval(t5_embed, clip_pooled)

        logger.info("[MLX] Text encoding done (%.1fs)", time.time() - t0)

        # ── Phase 1→2 transition: free text encoders ────────────────────
        if self.phased_memory:
            logger.info("[MLX] Releasing text encoders to free memory...")
            self.t5_encoder = None
            self.clip_encoder = None
            mx.clear_cache()

        # ── Phase 2: Denoising ──────────────────────────────────────────
        logger.info("[MLX] Phase 2: Denoising (%d steps)...", steps)
        t1 = time.time()

        lat_h = height // 8
        lat_w = width // 8

        # Initial noise in latent space, packed into 2x2 patch tokens.
        noise = mx.random.normal((1, lat_h, lat_w, 16))
        latents = patchify(noise)  # [1, seq, 64]
        img_ids = compute_img_ids(lat_h, lat_w)
        # Text tokens all share position (0, 0, 0) in the 3-axis RoPE.
        txt_ids = mx.zeros((1, t5_embed.shape[1], 3), dtype=mx.int32)

        # Compute sigma schedule with exponential time shift
        image_seq_len = latents.shape[1]
        timesteps_list, sigmas = self.scheduler.compute_sigmas(steps, image_seq_len)

        # Optionally compile the transformer forward pass for speed
        _forward_fn = self.transformer
        if self.use_compile:
            try:
                _forward_fn = mx.compile(self.transformer)
                logger.info("[MLX] Using mx.compile for DiT forward pass")
            except Exception as exc:
                logger.warning("[MLX] mx.compile not available: %s", exc)

        for i in range(steps):
            sigma = sigmas[i]
            sigma_next = sigmas[i + 1]
            t = mx.array([timesteps_list[i]])

            # Scale latents before transformer
            latents_scaled = self.scheduler.scale_latents(latents, sigma)

            v_pred = _forward_fn(
                latents_scaled, img_ids,
                t5_embed, txt_ids,
                clip_pooled, t,
            )
            # Per-step eval keeps the lazy graph (and peak memory) bounded.
            mx.eval(v_pred)

            # Euler update toward the next (lower) sigma level; the step
            # direction/sign convention lives in scheduler.step.
            latents = self.scheduler.step(v_pred, sigma, sigma_next, latents)
            mx.eval(latents)

            if progress_callback:
                progress_callback(i + 1, steps)
            logger.debug("[MLX] Step %d/%d (sigma=%.4f)", i + 1, steps, sigma)

        logger.info("[MLX] Denoising done (%.1fs)", time.time() - t1)

        # ── Phase 2→3 transition: free DiT to make room for VAE ─────────
        if self.phased_memory:
            logger.info("[MLX] Releasing DiT to free memory...")
            self.transformer = None
            mx.clear_cache()

        # ── Phase 3: VAE decode ─────────────────────────────────────────
        logger.info("[MLX] Phase 3: VAE decode...")
        t2 = time.time()

        z = unpatchify(latents, lat_h, lat_w)  # [1, lat_h, lat_w, 16]
        image = self.vae.decode(z)  # [1, H, W, 3]
        mx.eval(image)

        logger.info("[MLX] VAE decode done (%.1fs)", time.time() - t2)

        # ── Convert to PIL ──────────────────────────────────────────────
        import numpy as np

        # NOTE(review): assumes decode() returns values in [0, 1] — the
        # *255 mapping clips anything outside; confirm against AutoencoderKL.
        img_np = np.array(image[0], copy=False)  # [H, W, 3]
        img_np = (img_np * 255).clip(0, 255).astype(np.uint8)
        return Image.fromarray(img_np)
368
+
369
+ # ── Memory management ────────────────────────────────────────────────
370
+
371
+ def unload(self) -> None:
372
+ """Free all model components from memory."""
373
+ self.t5_encoder = None
374
+ self.clip_encoder = None
375
+ self.transformer = None
376
+ self.vae = None
377
+ self._loaded = False
378
+
379
+ try:
380
+ mx.clear_cache()
381
+ except Exception:
382
+ pass
383
+ logger.info("[MLX] Pipeline unloaded")
384
+
385
+ def memory_info(self) -> dict:
386
+ """Return MLX Metal memory usage info."""
387
+ try:
388
+ peak = mx.get_peak_memory() / (1024 ** 3)
389
+ active = mx.get_active_memory() / (1024 ** 3)
390
+ cache = mx.get_cache_memory() / (1024 ** 3)
391
+ return {
392
+ "peak_gb": round(peak, 2),
393
+ "active_gb": round(active, 2),
394
+ "cache_gb": round(cache, 2),
395
+ }
396
+ except AttributeError:
397
+ # Fallback for older MLX with mx.metal.* API
398
+ try:
399
+ peak = mx.metal.get_peak_memory() / (1024 ** 3)
400
+ active = mx.metal.get_active_memory() / (1024 ** 3)
401
+ cache = mx.metal.get_cache_memory() / (1024 ** 3)
402
+ return {
403
+ "peak_gb": round(peak, 2),
404
+ "active_gb": round(active, 2),
405
+ "cache_gb": round(cache, 2),
406
+ }
407
+ except Exception:
408
+ return {}
409
+ except Exception:
410
+ return {}
sampler.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow-matching Euler scheduler + latent patchify/unpatchify.
2
+
3
+ Matches mflux FlowMatchEulerDiscreteScheduler implementation.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import math
9
+
10
+ import mlx.core as mx
11
+
12
+
13
class FlowMatchEulerScheduler:
    """Rectified flow matching scheduler with exponential time shift.

    Matches mflux's FlowMatchEulerDiscreteScheduler exactly.
    """

    def compute_sigmas(self, num_steps: int, image_seq_len: int = 256):
        """Build the shifted sigma schedule.

        Returns ``(timesteps, sigmas)``:
          - timesteps: sigma * 1000 (the transformer's conditioning scale)
          - sigmas: shifted values plus a terminal 0.0, so ``sigmas[i + 1]``
            is valid on the final step.
        """
        base = mx.linspace(1.0, 1.0 / num_steps, num_steps, dtype=mx.float32)
        mu = self._compute_mu(image_seq_len, num_steps)
        shifted = self._time_shift(mu, 1.0, base)
        return (shifted * 1000).tolist(), shifted.tolist() + [0.0]

    @staticmethod
    def _compute_mu(image_seq_len: int, num_steps: int) -> float:
        """Empirical shift parameter: mu = 0.5 * ln(sequence length)."""
        return 0.5 * math.log(image_seq_len)

    @staticmethod
    def _time_shift(mu: float, sigma_max: float, sigmas):
        """Exponential interpolation: sigma' = e^mu / (e^mu + 1/sigma - 1)."""
        return mx.exp(mu) / (mx.exp(mu) + (1 / sigmas - 1))

    def scale_latents(self, latents, sigma):
        """Scale latents before the transformer: x / sqrt(sigma^2 + 1)."""
        denom = (sigma ** 2 + 1) ** 0.5
        return latents / denom

    def step(self, velocity, sigma_cur, sigma_next, latents):
        """Advance one Euler step of the flow ODE.

        Args:
            velocity: v_theta prediction [B, seq, D]
            sigma_cur: current sigma level
            sigma_next: next sigma level
            latents: current latent state [B, seq, D]

        Returns:
            Updated latents: x + (sigma_cur - sigma_next) * v.
        """
        dt = mx.array(sigma_cur - sigma_next, dtype=latents.dtype)
        return latents + dt * velocity
73
+
74
+
75
def patchify(latents):
    """Fold spatial latents into a sequence of 2x2 patch tokens.

    [B, H, W, C] -> [B, (H/2)*(W/2), C*4]

    FLUX packs each non-overlapping 2x2xC latent patch into a single
    token of length 4*C.
    """
    B, H, W, C = latents.shape
    rows, cols = H // 2, W // 2
    grid = latents.reshape(B, rows, 2, cols, 2, C)
    grid = grid.transpose(0, 1, 3, 2, 4, 5)  # [B, rows, cols, 2, 2, C]
    return grid.reshape(B, rows * cols, 4 * C)
87
+
88
+
89
def unpatchify(tokens, h, w):
    """Inverse of :func:`patchify`: patch tokens back to spatial latents.

    [B, seq, C*4] -> [B, h, w, C]

    Args:
        tokens: [B, (h/2)*(w/2), C*4]
        h, w: latent spatial dimensions (before patching)
    """
    B = tokens.shape[0]
    C = tokens.shape[-1] // 4  # e.g. 64 // 4 = 16 channels
    grid = tokens.reshape(B, h // 2, w // 2, 2, 2, C)
    grid = grid.transpose(0, 1, 3, 2, 4, 5)  # [B, h/2, 2, w/2, 2, C]
    return grid.reshape(B, h, w, C)
104
+
105
+
106
def compute_img_ids(lat_h: int, lat_w: int):
    """Build (t, row, col) position ids for patchified latent tokens.

    Returns [1, (lat_h/2)*(lat_w/2), 3]; the leading coordinate is the
    always-zero temporal axis of FLUX's 3-axis RoPE.
    """
    rows, cols = lat_h // 2, lat_w // 2
    n = rows * cols

    # Broadcast the row/col indices over a [rows, cols] grid.
    row_grid = mx.repeat(mx.arange(rows)[:, None], cols, axis=1)
    col_grid = mx.repeat(mx.arange(cols)[None, :], rows, axis=0)

    ids = mx.stack(
        [
            mx.zeros((n,), dtype=mx.int32),  # time axis: all zeros
            row_grid.reshape(-1),
            col_grid.reshape(-1),
        ],
        axis=-1,
    )  # [n, 3]
    return ids.reshape(1, n, 3)
t5_encoder.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """T5-XXL text encoder for FLUX pipeline.
2
+
3
+ Implements a 24-layer T5 encoder with relative position bias,
4
+ gated FFN (GeLU), and RMS LayerNorm β€” matching the HuggingFace
5
+ ``google/t5-xxl`` architecture used by FLUX.1.
6
+
7
+ Weight source: ``black-forest-labs/FLUX.1-schnell`` β†’
8
+ ``text_encoder_2/model-0000{1,2}-of-00002.safetensors``
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import math
14
+
15
+ import mlx.core as mx
16
+ import mlx.nn as nn
17
+
18
+
19
+ # ── T5 Config (XXL) ──────────────────────────────────────────────────────────
20
+
21
class T5Config:
    """Hyperparameters of the T5-XXL encoder used by FLUX."""

    vocab_size: int = 32128
    d_model: int = 4096      # hidden size
    d_ff: int = 10240        # gated-FFN inner size
    num_heads: int = 64
    head_dim: int = 64  # d_model / num_heads
    num_layers: int = 24
    relative_attention_num_buckets: int = 32   # learned-bias table rows
    relative_attention_max_distance: int = 128  # clamp for log bucketing
+
31
+
32
+ # ── Building blocks ──────────────────────────────────────────────────────────
33
+
34
class T5RMSNorm(nn.Module):
    """T5-style RMS LayerNorm (no bias, no mean subtraction)."""

    def __init__(self, d: int, eps: float = 1e-6):
        super().__init__()
        self.weight = mx.ones((d,))
        self.eps = eps

    def __call__(self, x):
        # Normalize by root-mean-square of the features, then rescale.
        mean_sq = mx.mean(mx.square(x), axis=-1, keepdims=True)
        normed = x * mx.rsqrt(mean_sq + self.eps)
        return self.weight * normed
46
+
47
+
48
class T5RelativeAttention(nn.Module):
    """Multi-head self-attention with T5 learned relative position bias.

    Note: T5 uses *unscaled* dot-product attention — the 1/sqrt(d) factor
    is folded into the pretrained weight initialization (see the HF
    ``T5Attention`` implementation). Dividing the scores by sqrt(head_dim)
    again, as the previous version did, shrinks the logits 8x and breaks
    parity with the pretrained T5-XXL checkpoint.
    """

    def __init__(self, cfg: T5Config, has_relative_bias: bool = False):
        super().__init__()
        self.num_heads = cfg.num_heads
        self.head_dim = cfg.head_dim
        d = cfg.d_model

        self.q_proj = nn.Linear(d, d, bias=False)
        self.k_proj = nn.Linear(d, d, bias=False)
        self.v_proj = nn.Linear(d, d, bias=False)
        self.out_proj = nn.Linear(d, d, bias=False)

        # Only layer 0 owns the bias table; other layers reuse its output.
        self.has_relative_bias = has_relative_bias
        if has_relative_bias:
            self.rel_bias = nn.Embedding(
                cfg.relative_attention_num_buckets, cfg.num_heads,
            )
        self._num_buckets = cfg.relative_attention_num_buckets
        self._max_distance = cfg.relative_attention_max_distance

    @staticmethod
    def _relative_position_bucket(
        relative_position: mx.array,
        num_buckets: int = 32,
        max_distance: int = 128,
    ) -> mx.array:
        """T5-style relative position bucketing.

        Maps relative positions to bucket indices for the learned bias.
        Bidirectional: first half for negative, second for positive;
        small offsets map linearly, larger ones logarithmically up to
        ``max_distance``.
        """
        # Bidirectional: use half the buckets for each direction
        num_buckets //= 2
        # Sign-based offset: positive offsets land in the upper half
        relative_buckets = (relative_position > 0).astype(mx.int32) * num_buckets
        relative_position = mx.abs(relative_position)

        # Small positions are mapped linearly
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # Larger positions are mapped logarithmically. (log(0) = -inf is
        # computed here but always masked out by the is_small branch below.)
        val = mx.log(relative_position.astype(mx.float32) / max_exact) / math.log(
            max_distance / max_exact
        )
        val = val * (num_buckets - max_exact)
        relative_position_if_large = (max_exact + val).astype(mx.int32)
        relative_position_if_large = mx.minimum(
            relative_position_if_large,
            mx.array(num_buckets - 1, dtype=mx.int32),
        )

        relative_buckets = relative_buckets + mx.where(
            is_small,
            relative_position.astype(mx.int32),
            relative_position_if_large,
        )
        return relative_buckets

    def _compute_bias(self, seq_len: int) -> mx.array:
        """Compute relative position bias [1, num_heads, seq, seq]."""
        positions = mx.arange(seq_len)
        # relative_position[i, j] = j - i (key position minus query position)
        relative_position = positions[None, :] - positions[:, None]
        buckets = self._relative_position_bucket(
            relative_position,
            num_buckets=self._num_buckets,
            max_distance=self._max_distance,
        )
        # [seq, seq] → lookup → [seq, seq, num_heads]
        bias = self.rel_bias(buckets)
        # → [1, num_heads, seq, seq]
        bias = bias.transpose(2, 0, 1).reshape(1, self.num_heads, seq_len, seq_len)
        return bias

    def __call__(
        self, x: mx.array, position_bias: mx.array | None = None,
    ) -> tuple[mx.array, mx.array | None]:
        """Returns (attended output, position_bias) so callers can thread
        the bias from layer 0 through the remaining layers."""
        B, L, _ = x.shape
        H, D = self.num_heads, self.head_dim

        q = self.q_proj(x).reshape(B, L, H, D).transpose(0, 2, 1, 3)
        k = self.k_proj(x).reshape(B, L, H, D).transpose(0, 2, 1, 3)
        v = self.v_proj(x).reshape(B, L, H, D).transpose(0, 2, 1, 3)

        # Unscaled dot-product attention (T5 convention — no 1/sqrt(d)).
        scores = q @ k.transpose(0, 1, 3, 2)  # [B, H, L, L]

        # Add relative position bias
        if self.has_relative_bias:
            position_bias = self._compute_bias(L)
        if position_bias is not None:
            scores = scores + position_bias

        weights = mx.softmax(scores, axis=-1)
        out = weights @ v  # [B, H, L, D]
        out = out.transpose(0, 2, 1, 3).reshape(B, L, -1)  # [B, L, d_model]
        return self.out_proj(out), position_bias
149
+
150
+
151
class T5GatedFFN(nn.Module):
    """T5 v1.1 gated feed-forward: (GeLU(x W0) * (x W1)) W2."""

    def __init__(self, cfg: T5Config):
        super().__init__()
        self.wi_0 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)  # gate branch
        self.wi_1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)  # value branch
        self.wo = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)

    def __call__(self, x):
        hidden = nn.gelu_approx(self.wi_0(x)) * self.wi_1(x)
        return self.wo(hidden)
164
+
165
+
166
class T5EncoderLayer(nn.Module):
    """One pre-norm T5 encoder layer: RMSNorm→attention, RMSNorm→gated FFN."""

    def __init__(self, cfg: T5Config, has_relative_bias: bool = False):
        super().__init__()
        self.attn_norm = T5RMSNorm(cfg.d_model)
        self.attn = T5RelativeAttention(cfg, has_relative_bias=has_relative_bias)
        self.ffn_norm = T5RMSNorm(cfg.d_model)
        self.ffn = T5GatedFFN(cfg)

    def __call__(self, x, position_bias=None):
        """Returns (hidden, position_bias); the bias is threaded along so
        only layer 0 has to compute it."""
        attn_out, position_bias = self.attn(self.attn_norm(x), position_bias)
        x = x + attn_out

        x = x + self.ffn(self.ffn_norm(x))

        return x, position_bias
192
+
193
+
194
+ # ── T5 Encoder ───────────────────────────────────────────────────────────────
195
+
196
class T5Encoder(nn.Module):
    """T5-XXL encoder: 24-layer transformer with relative position bias.

    Input: token_ids [B, seq_len]
    Output: embeddings [B, seq_len, 4096]

    Only layer 0 owns the relative-position bias table; the computed bias
    tensor is threaded through the remaining layers.
    """

    def __init__(self, cfg: T5Config | None = None):
        super().__init__()
        self.cfg = T5Config() if cfg is None else cfg

        self.wte = nn.Embedding(self.cfg.vocab_size, self.cfg.d_model)
        self.layers = [
            T5EncoderLayer(self.cfg, has_relative_bias=(idx == 0))
            for idx in range(self.cfg.num_layers)
        ]
        self.final_norm = T5RMSNorm(self.cfg.d_model)

    def __call__(self, token_ids, use_transformer: bool = True):
        hidden = self.wte(token_ids)  # [B, L, d_model]

        if use_transformer:
            # Full 24-layer stack; bias computed once by layer 0.
            bias = None
            for layer in self.layers:
                hidden, bias = layer(hidden, bias)

        return self.final_norm(hidden)
tokenizers.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tokenizers for FLUX pipeline β€” T5 (SentencePiece) and CLIP (BPE).
2
+
3
+ Both tokenizers produce mx.array token ID tensors ready for encoder input.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+
12
+ import mlx.core as mx
13
+
14
+ logger = logging.getLogger("image-server")
15
+
16
+
17
class T5Tokenizer:
    """T5-XXL SentencePiece tokenizer.

    Loads ``spiece.model`` from the FLUX.1-schnell repo
    (``tokenizer_2/spiece.model``).

    Mirrors the HF ``T5Tokenizer`` contract: the encoded sequence is
    terminated with the ``</s>`` EOS token before zero-padding — the
    pretrained encoder expects this terminator, which the previous
    version omitted.
    """

    def __init__(self, spiece_path: str, max_length: int = 256):
        import sentencepiece as spm

        self._sp = spm.SentencePieceProcessor()
        self._sp.Load(spiece_path)
        self.max_length = max_length
        self.pad_id = 0
        # SentencePiece reports -1 when no EOS is defined; T5's spiece
        # model uses id 1 for </s>.
        eos = self._sp.eos_id()
        self.eos_id = eos if eos >= 0 else 1

    def tokenize(self, text: str) -> mx.array:
        """Tokenize text → [1, max_length] int32 tensor (EOS-terminated, padded)."""
        ids = list(self._sp.Encode(text))
        # Reserve one slot for EOS, truncating if necessary (HF T5 behavior).
        if len(ids) > self.max_length - 1:
            ids = ids[: self.max_length - 1]
        ids.append(self.eos_id)
        # Pad with zeros up to the fixed context length.
        pad_len = self.max_length - len(ids)
        if pad_len > 0:
            ids = ids + [self.pad_id] * pad_len
        return mx.array(ids, dtype=mx.int32).reshape(1, -1)
43
+
44
+
45
class CLIPTokenizer:
    """CLIP BPE tokenizer.

    Loads ``vocab.json`` (token→id) and ``merges.txt`` (BPE merge rules)
    from ``tokenizer/`` in the FLUX.1-schnell repo.

    NOTE(review): pads with id 0 — HF's CLIPTokenizer pads with the EOS
    token (49407) for SD-family models; confirm which convention the
    CLIP encoder's pooling expects.
    """

    BOS_ID = 49406  # <|startoftext|>
    EOS_ID = 49407  # <|endoftext|>

    def __init__(self, vocab_path: str, merges_path: str, max_length: int = 77):
        # Load vocab: token_str → id
        with open(vocab_path, encoding="utf-8") as f:
            self._vocab: dict[str, int] = json.load(f)

        # Load BPE merges from merges.txt; line order defines merge priority
        # (lower index = applied first).
        self._merges: list[tuple[str, str]] = []
        self._merge_rank: dict[tuple[str, str], int] = {}
        with open(merges_path, encoding="utf-8") as f:
            for i, line in enumerate(f):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) == 2:
                    pair = (parts[0], parts[1])
                    self._merges.append(pair)
                    self._merge_rank[pair] = i

        self.max_length = max_length
        self.pad_id = 0

        # pre/post processing regex (simplified CLIP pattern): contractions,
        # letter runs, single digits, and punctuation runs.
        import regex
        self._pat = regex.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d|"""
            r"""[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            regex.IGNORECASE,
        )

    def _bpe(self, token: str) -> list[str]:
        """Apply BPE merges to a single word token.

        The last character carries the ``</w>`` end-of-word marker, as in
        the original CLIP vocabulary.
        """
        if len(token) <= 1:
            return [token + "</w>"] if token else []

        # Add end-of-word marker
        word = list(token[:-1]) + [token[-1] + "</w>"]

        while len(word) > 1:
            # Find the highest-priority merge pair (lowest rank wins)
            best_pair = None
            best_rank = float("inf")
            for i in range(len(word) - 1):
                pair = (word[i], word[i + 1])
                rank = self._merge_rank.get(pair, float("inf"))
                if rank < best_rank:
                    best_rank = rank
                    best_pair = pair

            # No applicable merge left → done
            if best_pair is None or best_rank == float("inf"):
                break

            # Apply the merge everywhere it occurs in this word
            new_word = []
            i = 0
            while i < len(word):
                if (
                    i < len(word) - 1
                    and word[i] == best_pair[0]
                    and word[i + 1] == best_pair[1]
                ):
                    new_word.append(best_pair[0] + best_pair[1])
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = new_word

        return word

    def tokenize(self, text: str) -> mx.array:
        """Tokenize text → [1, max_length] int32 tensor (BOS … EOS, padded)."""
        text = text.lower().strip()

        ids = [self.BOS_ID]

        # Tokenize each regex-matched word piece
        for match in self._pat.finditer(text):
            word = match.group()
            bpe_tokens = self._bpe(word)
            for bt in bpe_tokens:
                # Unknown pieces fall back to id 0
                token_id = self._vocab.get(bt, 0)
                ids.append(token_id)

        ids.append(self.EOS_ID)

        # Truncate (keep BOS at start, EOS at end)
        if len(ids) > self.max_length:
            ids = ids[: self.max_length - 1] + [self.EOS_ID]

        # Pad
        pad_len = self.max_length - len(ids)
        if pad_len > 0:
            ids = ids + [self.pad_id] * pad_len

        return mx.array(ids, dtype=mx.int32).reshape(1, -1)
weight_loader.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Weight loader β€” safetensors β†’ MLX model parameter mapping.
2
+
3
+ Handles multi-shard loading, HuggingFace key β†’ self-defined key mapping,
4
+ and dtype conversion. Each ``load_*`` function takes a model instance
5
+ and populates its parameters in-place.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ import mlx.core as mx
14
+ import mlx.nn as nn
15
+
16
+ logger = logging.getLogger("image-server")
17
+
18
+
19
+ # ── Utilities ────────────────────────────────────────────────────────────────
20
+
21
def _load_safetensors(*paths: str) -> dict[str, mx.array]:
    """Load one or more safetensors files into a flat dict of MLX arrays.

    Tries the native MLX framework reader first (fastest). Falls back to a
    PyTorch reader for files MLX cannot read directly (e.g. bfloat16),
    preserving uint32 quantized weights as-is.

    Args:
        *paths: One or more ``.safetensors`` file paths.

    Returns:
        Mapping of tensor name -> ``mx.array``, merged across all files.
        Files that fail to load in both frameworks are logged and skipped.
    """
    from safetensors import safe_open

    weights: dict[str, mx.array] = {}
    for p in paths:
        fname = Path(p).name  # consistent with the file's pathlib usage
        # Fast path: native MLX reader.
        try:
            with safe_open(p, framework="mlx") as f:
                for key in f.keys():
                    weights[key] = f.get_tensor(key)
            logger.info("[WeightLoader] Loaded %s via MLX framework", fname)
            continue
        except Exception:
            pass  # bfloat16 or other incompatibility → fallback to pt

        # Fallback: load via PyTorch, selectively convert.
        try:
            import torch

            n_before = len(weights)
            with safe_open(p, framework="pt") as f:
                for key in f.keys():
                    t = f.get_tensor(key)
                    if t.dtype == torch.uint32:
                        # Quantized weight — keep raw uint32 bit pattern.
                        weights[key] = mx.array(t.numpy())
                    elif t.dtype == torch.bfloat16:
                        # numpy has no bfloat16; upcast to float32 first.
                        weights[key] = mx.array(t.float().numpy())
                    else:
                        # float32, float16, etc. — direct conversion.
                        weights[key] = mx.array(t.numpy())
            # Bug fix: report tensors loaded from THIS file rather than the
            # cumulative total across all shards processed so far.
            logger.info(
                "[WeightLoader] Loaded %s via PyTorch fallback (%d tensors)",
                fname, len(weights) - n_before,
            )
        except Exception as exc:
            # Best-effort by design: log and continue with remaining shards.
            logger.error("[WeightLoader] Failed to load %s: %s", p, exc)
    return weights
60
+
61
+
62
def _set_nested(model: nn.Module, dotpath: str, value: mx.array) -> bool:
    """Set a parameter on *model* using a dot-separated path.

    Numeric path components index into list-like containers
    (e.g. ``layers.3.attn``); all other components are attribute lookups.

    Args:
        model: Root module to traverse.
        dotpath: Dot-separated path, e.g. ``"layers.0.attn.q_proj.weight"``.
        value: Array assigned at the final path component.

    Returns:
        True if set successfully, False if any intermediate component is
        missing or an index is out of range.
    """
    parts = dotpath.split(".")
    obj = model
    for part in parts[:-1]:
        if part.isdigit():
            # Robustness fix: an out-of-range or unsupported index should
            # report failure like an attribute miss, not raise.
            try:
                obj = obj[int(part)]
            except (IndexError, KeyError, TypeError):
                return False
        elif hasattr(obj, part):
            obj = getattr(obj, part)
        else:
            return False
    final = parts[-1]
    if hasattr(obj, final):
        setattr(obj, final, value)
        return True
    return False
81
+
82
+
83
+ # ── T5 Weight Loader ─────────────────────────────────────────────────────────
84
+
85
# HuggingFace T5 checkpoint key -> our T5Encoder parameter path.
# Entries containing "{i}" are per-layer templates; _build_t5_key_map
# expands them for every encoder layer index.
_T5_KEY_MAP_TEMPLATES = {
    # Token embedding table.
    "encoder.embed_tokens.weight": "wte.weight",
    "shared.weight": "wte.weight",  # some checkpoints use this key

    # Per-layer keys (use {i} placeholder)
    "encoder.block.{i}.layer.0.layer_norm.weight": "layers.{i}.attn_norm.weight",
    "encoder.block.{i}.layer.0.SelfAttention.q.weight": "layers.{i}.attn.q_proj.weight",
    "encoder.block.{i}.layer.0.SelfAttention.k.weight": "layers.{i}.attn.k_proj.weight",
    "encoder.block.{i}.layer.0.SelfAttention.v.weight": "layers.{i}.attn.v_proj.weight",
    "encoder.block.{i}.layer.0.SelfAttention.o.weight": "layers.{i}.attn.out_proj.weight",
    "encoder.block.{i}.layer.1.layer_norm.weight": "layers.{i}.ffn_norm.weight",
    "encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight": "layers.{i}.ffn.wi_0.weight",
    "encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight": "layers.{i}.ffn.wi_1.weight",
    "encoder.block.{i}.layer.1.DenseReluDense.wo.weight": "layers.{i}.ffn.wo.weight",

    # Relative attention bias lives only on layer 0 in T5 checkpoints.
    "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight":
        "layers.0.attn.rel_bias.weight",

    # Final norm
    "encoder.final_layer_norm.weight": "final_norm.weight",
}
109
+
110
+
111
def _build_t5_key_map(num_layers: int = 24) -> dict[str, str]:
    """Expand the ``{i}``-templated T5 key map for *num_layers* layers."""
    mapping: dict[str, str] = {}
    for hf_tpl, our_tpl in _T5_KEY_MAP_TEMPLATES.items():
        # Non-templated entries pass straight through.
        if "{i}" not in hf_tpl:
            mapping[hf_tpl] = our_tpl
            continue
        for layer in range(num_layers):
            idx = str(layer)
            mapping[hf_tpl.replace("{i}", idx)] = our_tpl.replace("{i}", idx)
    return mapping
123
+
124
+
125
def load_t5(paths: list[str], model) -> None:
    """Load T5 encoder weights from safetensors shards into *model*.

    Args:
        paths: Safetensors shard paths, merged in order.
        model: T5Encoder instance whose parameters are populated in-place.
    """
    weights = _load_safetensors(*paths)
    key_map = _build_t5_key_map(num_layers=model.cfg.num_layers)

    loaded = 0
    unmapped: list[str] = []
    for hf_key, tensor in weights.items():
        target = key_map.get(hf_key)
        # No transpose needed: HF stores Linear weights as [out, in], which
        # is exactly the layout MLX nn.Linear expects (it computes x @ W.T).
        if target is None:
            unmapped.append(hf_key)
        elif _set_nested(model, target, tensor):
            loaded += 1
        else:
            unmapped.append(f"{hf_key} → {target} (path not found)")

    logger.info(
        "[WeightLoader] T5: loaded %d/%d params, unmapped %d",
        loaded, len(weights), len(unmapped),
    )
    if unmapped:
        for key in unmapped[:10]:
            logger.debug("  unmapped: %s", key)
        if len(unmapped) > 10:
            logger.debug("  ... and %d more", len(unmapped) - 10)
154
+
155
+
156
+ # ── CLIP Weight Loader ───────────────────────────────────────────────────────
157
+
158
+ _CLIP_KEY_MAP_TEMPLATES = {
159
+ # Embeddings
160
+ "text_model.embeddings.token_embedding.weight": "token_emb.weight",
161
+ "text_model.embeddings.position_embedding.weight": "pos_emb.weight",
162
+
163
+ # Per-layer keys
164
+ "text_model.encoder.layers.{i}.layer_norm1.weight": "layers.{i}.norm1.weight",
165
+ "text_model.encoder.layers.{i}.layer_norm1.bias": "layers.{i}.norm1.bias",
166
+ "text_model.encoder.layers.{i}.self_attn.q_proj.weight": "layers.{i}.attn.q_proj.weight",
167
+ "text_model.encoder.layers.{i}.self_attn.q_proj.bias": "layers.{i}.attn.q_proj.bias",
168
+ "text_model.encoder.layers.{i}.self_attn.k_proj.weight": "layers.{i}.attn.k_proj.weight",
169
+ "text_model.encoder.layers.{i}.self_attn.k_proj.bias": "layers.{i}.attn.k_proj.bias",
170
+ "text_model.encoder.layers.{i}.self_attn.v_proj.weight": "layers.{i}.attn.v_proj.weight",
171
+ "text_model.encoder.layers.{i}.self_attn.v_proj.bias": "layers.{i}.attn.v_proj.bias",
172
+ "text_model.encoder.layers.{i}.self_attn.out_proj.weight": "layers.{i}.attn.out_proj.weight",
173
+ "text_model.encoder.layers.{i}.self_attn.out_proj.bias": "layers.{i}.attn.out_proj.bias",
174
+ "text_model.encoder.layers.{i}.layer_norm2.weight": "layers.{i}.norm2.weight",
175
+ "text_model.encoder.layers.{i}.layer_norm2.bias": "layers.{i}.norm2.bias",
176
+ "text_model.encoder.layers.{i}.mlp.fc1.weight": "layers.{i}.mlp.fc1.weight",
177
+ "text_model.encoder.layers.{i}.mlp.fc1.bias": "layers.{i}.mlp.fc1.bias",
178
+ "text_model.encoder.layers.{i}.mlp.fc2.weight": "layers.{i}.mlp.fc2.weight",
179
+ "text_model.encoder.layers.{i}.mlp.fc2.bias": "layers.{i}.mlp.fc2.bias",
180
+
181
+ # Final norm
182
+ "text_model.final_layer_norm.weight": "final_norm.weight",
183
+ "text_model.final_layer_norm.bias": "final_norm.bias",
184
+ }
185
+
186
+
187
def _build_clip_key_map(num_layers: int = 23) -> dict[str, str]:
    """Expand the ``{i}``-templated CLIP key map for *num_layers* layers."""
    # Entries without a placeholder map one-to-one.
    static = {
        hf: ours
        for hf, ours in _CLIP_KEY_MAP_TEMPLATES.items()
        if "{i}" not in hf
    }
    # Templated entries are replicated once per layer index.
    per_layer = {
        hf.replace("{i}", str(i)): ours.replace("{i}", str(i))
        for hf, ours in _CLIP_KEY_MAP_TEMPLATES.items()
        if "{i}" in hf
        for i in range(num_layers)
    }
    return {**static, **per_layer}
199
+
200
+
201
def load_clip(path: str, model) -> None:
    """Load CLIP text-encoder weights from safetensors into *model*.

    Args:
        path: Path to the CLIP safetensors checkpoint.
        model: CLIPEncoder instance whose parameters are populated in-place.
    """
    weights = _load_safetensors(path)
    key_map = _build_clip_key_map(num_layers=model.cfg.num_layers)

    loaded = 0
    unmapped: list[str] = []
    for hf_key, tensor in weights.items():
        our_key = key_map.get(hf_key)
        if our_key is None:
            unmapped.append(hf_key)
            continue
        if _set_nested(model, our_key, tensor):
            loaded += 1
        else:
            unmapped.append(f"{hf_key} → {our_key} (path not found)")

    logger.info(
        "[WeightLoader] CLIP: loaded %d/%d params, unmapped %d",
        loaded, len(weights), len(unmapped),
    )
    if unmapped:
        for k in unmapped[:10]:
            logger.debug("  unmapped: %s", k)
        # Consistency fix: load_t5 reports how many diagnostics were
        # elided; do the same here.
        if len(unmapped) > 10:
            logger.debug("  ... and %d more", len(unmapped) - 10)
225
+
226
+
227
+ # ── Placeholder for future rounds ────────────────────────────────────────────
228
+
229
def load_flux_dit(path: str, model) -> None:
    """Stub: FLUX DiT weight loading is implemented in round C-122b."""
    raise NotImplementedError("FLUX DiT weight loading — see C-122b")
232
+
233
+
234
def load_vae(path: str, model) -> None:
    """Stub: VAE decoder weight loading is implemented in round C-122c."""
    raise NotImplementedError("VAE weight loading — see C-122c")