Spaces:

AbstractPhil
/

tinyflux-lailah

Running on Zero

App Files Files Community

AbstractPhil committed 25 days ago

Commit

a29d3c5

verified ·

1 Parent(s): 5086269

Update app.py

Browse files

Files changed (1) hide show

app.py +295 -305

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 TinyFlux-Lailah Gradio Demo
 HuggingFace Spaces with ZeroGPU support
 """
 import gradio as gr
@@ -8,19 +9,20 @@ import numpy as np
 import random
 import spaces
 import torch
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from transformers import T5EncoderModel, T5Tokenizer, CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL
 from PIL import Image
-import torch.nn as nn
-import torch.nn.functional as F
-import math
-from dataclasses import dataclass
-from typing import Tuple
 # ============================================================================
-# MODEL DEFINITION (TinyFluxDeep / Lailah)
 # ============================================================================
 @dataclass
@@ -40,23 +42,29 @@ class TinyFluxDeepConfig:
     def __post_init__(self):
         assert self.num_attention_heads * self.attention_head_dim == self.hidden_size
 class RMSNorm(nn.Module):
-    def __init__(self, dim, eps=1e-6):
         super().__init__()
-        self.weight = nn.Parameter(torch.ones(dim))
         self.eps = eps
-    def forward(self, x):
-        dtype = x.dtype
-        x = x.float()
-        norm = x.pow(2).mean(-1, keepdim=True).add(self.eps).rsqrt()
-        return (x * norm).to(dtype) * self.weight
 class EmbedND(nn.Module):
-    def __init__(self, theta=10000.0, axes_dim=(16, 56, 56)):
         super().__init__()
         self.theta = theta
         self.axes_dim = axes_dim
@@ -64,231 +72,263 @@ class EmbedND(nn.Module):
             freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
             self.register_buffer(f'freqs_{i}', freqs, persistent=True)
-    def forward(self, ids):
-        rope_components = []
-        for i, dim in enumerate(self.axes_dim):
-            freqs = getattr(self, f'freqs_{i}').to(ids.device)
-            axis_ids = ids[..., i:i+1]
-            angles = axis_ids * freqs
-            cos = torch.cos(angles)
-            sin = torch.sin(angles)
-            interleaved = torch.stack([cos, sin], dim=-1).flatten(-2)
-            rope_components.append(interleaved)
-        return torch.cat(rope_components, dim=-1)
-def apply_rope(x, rope):
-    B, H, N, D = x.shape
-    rope = rope[:, :N, :D]
-    rope = rope.unsqueeze(1)
-    x_pairs = x.reshape(B, H, N, D // 2, 2)
-    cos = rope[..., 0::2]
-    sin = rope[..., 1::2]
-    x_rot = torch.stack([
-        x_pairs[..., 0] * cos - x_pairs[..., 1] * sin,
-        x_pairs[..., 1] * cos + x_pairs[..., 0] * sin,
-    ], dim=-1)
-    return x_rot.flatten(-2)
 class MLPEmbedder(nn.Module):
-    def __init__(self, in_dim, hidden_dim):
         super().__init__()
-        self.fc1 = nn.Linear(in_dim, hidden_dim)
-        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
-    def forward(self, x):
-        return self.fc2(F.silu(self.fc1(x)))
-class QKNorm(nn.Module):
-    def __init__(self, dim):
         super().__init__()
-        self.query_norm = RMSNorm(dim)
-        self.key_norm = RMSNorm(dim)
-    def forward(self, q, k):
-        return self.query_norm(q), self.key_norm(k)
-class DoubleAttention(nn.Module):
-    def __init__(self, hidden_size, num_heads, head_dim):
         super().__init__()
-        self.num_heads = num_heads
-        self.head_dim = head_dim
-        qkv_dim = num_heads * head_dim * 3
-        self.img_qkv = nn.Linear(hidden_size, qkv_dim, bias=True)
-        self.img_out = nn.Linear(num_heads * head_dim, hidden_size, bias=True)
-        self.txt_qkv = nn.Linear(hidden_size, qkv_dim, bias=True)
-        self.txt_out = nn.Linear(num_heads * head_dim, hidden_size, bias=True)
-        self.img_norm = QKNorm(head_dim)
-        self.txt_norm = QKNorm(head_dim)
-    def forward(self, img, txt, rope):
-        B, N_img, _ = img.shape
-        N_txt = txt.shape[1]
-        img_qkv = self.img_qkv(img).reshape(B, N_img, 3, self.num_heads, self.head_dim)
-        img_q, img_k, img_v = img_qkv.permute(2, 0, 3, 1, 4).unbind(0)
-        img_q, img_k = self.img_norm(img_q, img_k)
-        img_q = apply_rope(img_q, rope)
-        img_k = apply_rope(img_k, rope)
-        txt_qkv = self.txt_qkv(txt).reshape(B, N_txt, 3, self.num_heads, self.head_dim)
-        txt_q, txt_k, txt_v = txt_qkv.permute(2, 0, 3, 1, 4).unbind(0)
-        txt_q, txt_k = self.txt_norm(txt_q, txt_k)
-        q = torch.cat([txt_q, img_q], dim=2)
-        k = torch.cat([txt_k, img_k], dim=2)
-        v = torch.cat([txt_v, img_v], dim=2)
-        attn_out = F.scaled_dot_product_attention(q, k, v)
-        txt_out, img_out = attn_out.split([N_txt, N_img], dim=2)
-        img_out = img_out.transpose(1, 2).reshape(B, N_img, -1)
-        txt_out = txt_out.transpose(1, 2).reshape(B, N_txt, -1)
-        return self.img_out(img_out), self.txt_out(txt_out)
-class DoubleBlock(nn.Module):
-    def __init__(self, hidden_size, num_heads, head_dim, mlp_ratio=4.0):
-        super().__init__()
-        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.img_mod = nn.Linear(hidden_size, hidden_size * 6, bias=True)
-        self.txt_mod = nn.Linear(hidden_size, hidden_size * 6, bias=True)
-        self.attn = DoubleAttention(hidden_size, num_heads, head_dim)
-        mlp_hidden = int(hidden_size * mlp_ratio)
-        self.img_mlp = nn.Sequential(
-            nn.Linear(hidden_size, mlp_hidden, bias=True),
-            nn.GELU(approximate="tanh"),
-            nn.Linear(mlp_hidden, hidden_size, bias=True),
-        )
-        self.txt_mlp = nn.Sequential(
-            nn.Linear(hidden_size, mlp_hidden, bias=True),
-            nn.GELU(approximate="tanh"),
-            nn.Linear(mlp_hidden, hidden_size, bias=True),
-        )
-    def forward(self, img, txt, cond, rope):
-        img_mod = self.img_mod(cond)
-        img_scale1, img_shift1, img_gate1, img_scale2, img_shift2, img_gate2 = img_mod.chunk(6, dim=-1)
-        txt_mod = self.txt_mod(cond)
-        txt_scale1, txt_shift1, txt_gate1, txt_scale2, txt_shift2, txt_gate2 = txt_mod.chunk(6, dim=-1)
-        img_normed = self.img_norm1(img) * (1 + img_scale1.unsqueeze(1)) + img_shift1.unsqueeze(1)
-        txt_normed = self.txt_norm1(txt) * (1 + txt_scale1.unsqueeze(1)) + txt_shift1.unsqueeze(1)
-        img_attn, txt_attn = self.attn(img_normed, txt_normed, rope)
-        img = img + img_gate1.unsqueeze(1) * img_attn
-        txt = txt + txt_gate1.unsqueeze(1) * txt_attn
-        img_normed2 = self.img_norm2(img) * (1 + img_scale2.unsqueeze(1)) + img_shift2.unsqueeze(1)
-        txt_normed2 = self.txt_norm2(txt) * (1 + txt_scale2.unsqueeze(1)) + txt_shift2.unsqueeze(1)
-        img = img + img_gate2.unsqueeze(1) * self.img_mlp(img_normed2)
-        txt = txt + txt_gate2.unsqueeze(1) * self.txt_mlp(txt_normed2)
-        return img, txt
-class SingleAttention(nn.Module):
-    def __init__(self, hidden_size, num_heads, head_dim):
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = head_dim
-        self.qkv = nn.Linear(hidden_size, num_heads * head_dim * 3, bias=True)
-        self.norm = QKNorm(head_dim)
-    def forward(self, x, rope):
         B, N, _ = x.shape
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
-        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
-        q, k = self.norm(q, k)
-        q = apply_rope(q, rope)
-        k = apply_rope(k, rope)
-        out = F.scaled_dot_product_attention(q, k, v)
-        return out.transpose(1, 2).reshape(B, N, -1)
-class SingleBlock(nn.Module):
-    def __init__(self, hidden_size, num_heads, head_dim, mlp_ratio=4.0):
         super().__init__()
-        self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.mod = nn.Linear(hidden_size, hidden_size * 3, bias=True)
-        self.attn = SingleAttention(hidden_size, num_heads, head_dim)
-        self.proj = nn.Linear(num_heads * head_dim, hidden_size, bias=True)
         mlp_hidden = int(hidden_size * mlp_ratio)
-        self.mlp = nn.Sequential(
-            nn.Linear(hidden_size, mlp_hidden, bias=True),
-            nn.GELU(approximate="tanh"),
-            nn.Linear(mlp_hidden, hidden_size, bias=True),
-        )
-    def forward(self, x, cond, rope):
-        mod = self.mod(cond)
-        scale, shift, gate = mod.chunk(3, dim=-1)
-        normed = self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-        attn_out = self.proj(self.attn(normed, rope))
-        mlp_out = self.mlp(normed)
-        return x + gate.unsqueeze(1) * (attn_out + mlp_out)
 class TinyFluxDeep(nn.Module):
-    def __init__(self, cfg: TinyFluxDeepConfig):
         super().__init__()
-        self.cfg = cfg
         self.img_in = nn.Linear(cfg.in_channels, cfg.hidden_size, bias=True)
         self.txt_in = nn.Linear(cfg.joint_attention_dim, cfg.hidden_size, bias=True)
-        self.time_in = MLPEmbedder(256, cfg.hidden_size)
-        self.guidance_in = MLPEmbedder(256, cfg.hidden_size)
-        self.vector_in = MLPEmbedder(cfg.pooled_projection_dim, cfg.hidden_size)
-        self.rope = EmbedND(axes_dim=cfg.axes_dims_rope)
         self.double_blocks = nn.ModuleList([
-            DoubleBlock(cfg.hidden_size, cfg.num_attention_heads, cfg.attention_head_dim, cfg.mlp_ratio)
-            for _ in range(cfg.num_double_layers)
         ])
         self.single_blocks = nn.ModuleList([
-            SingleBlock(cfg.hidden_size, cfg.num_attention_heads, cfg.attention_head_dim, cfg.mlp_ratio)
-            for _ in range(cfg.num_single_layers)
         ])
-        self.final_norm = nn.LayerNorm(cfg.hidden_size, elementwise_affine=False, eps=1e-6)
-        self.final_mod = nn.Linear(cfg.hidden_size, cfg.hidden_size * 2, bias=True)
         self.final_linear = nn.Linear(cfg.hidden_size, cfg.in_channels, bias=True)
-    def time_embed(self, t):
-        half_dim = 128
-        freqs = torch.exp(-math.log(10000) * torch.arange(half_dim, device=t.device) / half_dim)
-        args = t.unsqueeze(-1) * freqs * 1000
-        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-    @staticmethod
-    def create_img_ids(batch_size, h, w, device):
-        img_ids = torch.zeros(h, w, 3, device=device)
-        img_ids[..., 1] = torch.arange(h, device=device).unsqueeze(1)
-        img_ids[..., 2] = torch.arange(w, device=device).unsqueeze(0)
-        return img_ids.reshape(1, h * w, 3).expand(batch_size, -1, -1)
-    def forward(self, hidden_states, encoder_hidden_states, pooled_projections, timestep, img_ids, guidance=None):
         img = self.img_in(hidden_states)
         txt = self.txt_in(encoder_hidden_states)
-        t_emb = self.time_embed(timestep).to(hidden_states.dtype)
-        cond = self.time_in(t_emb)
-        if guidance is not None and self.cfg.guidance_embeds:
-            g_emb = self.time_embed(guidance).to(hidden_states.dtype)
-            cond = cond + self.guidance_in(g_emb)
-        cond = cond + self.vector_in(pooled_projections)
-        rope = self.rope(img_ids)
         for block in self.double_blocks:
-            img, txt = block(img, txt, cond, rope)
-        x = torch.cat([txt, img], dim=1)
-        # Pad rope with identity for text positions (text has no positional encoding)
-        # RoPE format is interleaved [cos0, sin0, cos1, sin1, ...], identity = cos=1, sin=0
-        txt_len = txt.shape[1]
-        identity_rope = torch.zeros(rope.shape[0], txt_len, rope.shape[-1], device=rope.device, dtype=rope.dtype)
-        identity_rope[..., 0::2] = 1.0  # cos positions = 1, sin positions stay 0
-        full_rope = torch.cat([identity_rope, rope], dim=1)
         for block in self.single_blocks:
-            x = block(x, cond, full_rope)
-        img = x[:, txt.shape[1]:, :]
-        mod = self.final_mod(cond)
-        scale, shift = mod.chunk(2, dim=-1)
-        img = self.final_norm(img) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-        return self.final_linear(img)
 # ============================================================================
@@ -299,16 +339,15 @@ DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
 MAX_SEED = np.iinfo(np.int32).max
 SHIFT = 3.0
 # ============================================================================
-# LOAD MODELS (outside GPU function for ZeroGPU compatibility)
 # ============================================================================
 print("Loading TinyFlux-Lailah...")
-# Model
 config = TinyFluxDeepConfig()
 model = TinyFluxDeep(config)
-# Load EMA weights (best quality)
 weights_path = hf_hub_download("AbstractPhil/tiny-flux-deep", "checkpoints/step_297500_ema.safetensors")
 weights = load_file(weights_path)
 model.load_state_dict(weights, strict=False)
@@ -316,7 +355,6 @@ model.eval()
 model.to(DTYPE)
 print(f"✓ Model loaded ({sum(p.numel() for p in model.parameters()):,} params)")
-# Text encoders
 print("Loading text encoders...")
 t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
 t5_enc = T5EncoderModel.from_pretrained("google/flan-t5-base", torch_dtype=DTYPE)
@@ -324,19 +362,19 @@ clip_tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 clip_enc = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=DTYPE)
 print("✓ Text encoders loaded")
-# VAE (local diffusers format)
 print("Loading VAE...")
 vae = AutoencoderKL.from_pretrained("./vae", torch_dtype=DTYPE)
 vae.eval()
 VAE_SCALE = vae.config.scaling_factor
-print("✓ VAE loaded")
 # ============================================================================
-# INFERENCE FUNCTIONS
 # ============================================================================
-def flux_shift(t, s=SHIFT):
-    return s * t / (1 + (s - 1) * t)
 @spaces.GPU(duration=90)
@@ -351,76 +389,75 @@ def generate(
     num_inference_steps: int,
     progress=gr.Progress(track_tqdm=True),
 ):
-    """Generate image with TinyFlux-Lailah."""
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
-    # Move models to GPU
     model.to(DEVICE)
     t5_enc.to(DEVICE)
     clip_enc.to(DEVICE)
     vae.to(DEVICE)
-    with torch.inference_mode():
         # Encode prompt
-        t5_in = t5_tok(
-            prompt, max_length=128, padding="max_length",
-            truncation=True, return_tensors="pt"
-        ).to(DEVICE)
-        t5_out = t5_enc(**t5_in).last_hidden_state.to(DTYPE)
-        clip_in = clip_tok(
-            prompt, max_length=77, padding="max_length",
-            truncation=True, return_tensors="pt"
-        ).to(DEVICE)
-        clip_out = clip_enc(**clip_in).pooler_output.to(DTYPE)
         # Latent dimensions
         H_lat = height // 8
         W_lat = width // 8
         C = 16
-        # Start from noise
         x = torch.randn(1, H_lat * W_lat, C, device=DEVICE, dtype=DTYPE, generator=generator)
-        img_ids = TinyFluxDeep.create_img_ids(1, H_lat, W_lat, DEVICE)
-        # Timesteps with Flux shift
-        t_linear = torch.linspace(0, 1, num_inference_steps + 1, device=DEVICE, dtype=DTYPE)
-        timesteps = flux_shift(t_linear, s=SHIFT)
-        # Euler sampling
         for i in range(num_inference_steps):
             t_curr = timesteps[i]
             t_next = timesteps[i + 1]
-            dt = t_next - t_curr
             t_batch = t_curr.unsqueeze(0)
             guidance = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)
-            with torch.autocast(device_type="cuda", dtype=DTYPE):
-                v = model(
-                    hidden_states=x,
-                    encoder_hidden_states=t5_out,
-                    pooled_projections=clip_out,
-                    timestep=t_batch,
-                    img_ids=img_ids,
-                    guidance=guidance,
-                )
             x = x + v * dt
-        # Decode
         latents = x.reshape(1, H_lat, W_lat, C).permute(0, 3, 1, 2)
         latents = latents / VAE_SCALE
         image = vae.decode(latents.to(vae.dtype)).sample
         image = (image / 2 + 0.5).clamp(0, 1)
         # To PIL
         image = image[0].float().permute(1, 2, 0).cpu().numpy()
         image = (image * 255).astype(np.uint8)
         image = Image.fromarray(image)
     return image, seed
@@ -450,13 +487,13 @@ with gr.Blocks(css=css) as demo:
         **241M parameter** flow-matching text-to-image model.
         Trained on teacher latents from Flux-Schnell.
-        [Model Card](https://huggingface.co/AbstractPhil/tiny-flux-deep) |
-        [GitHub](https://github.com/AbstractPhil)
         """)
         with gr.Row():
             prompt = gr.Text(
                 label="Prompt",
                 show_label=False,
                 max_lines=2,
                 placeholder="Enter your prompt...",
@@ -470,78 +507,31 @@ with gr.Blocks(css=css) as demo:
             negative_prompt = gr.Text(
                 label="Negative prompt",
                 max_lines=1,
-                placeholder="(not used in this model)",
                 visible=False,
             )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=42,
-            )
             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
             with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=768,
-                    step=64,
-                    value=512,
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=768,
-                    step=64,
-                    value=512,
-                )
             with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=1.0,
-                    maximum=10.0,
-                    step=0.5,
-                    value=3.5,
-                )
-                num_inference_steps = gr.Slider(
-                    label="Steps",
-                    minimum=10,
-                    maximum=50,
-                    step=1,
-                    value=28,
-                )
         gr.Examples(examples=examples, inputs=[prompt])
         gr.Markdown("""
         ---
-        **Notes:**
-        - Trained on 512×512 resolution
-        - Best results at guidance 3.0-5.0
-        - 20-30 steps recommended
-        - Early checkpoint - quality improving with training
         """)
     gr.on(
         triggers=[run_button.click, prompt.submit],
         fn=generate,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
         outputs=[result, seed],
     )

 """
 TinyFlux-Lailah Gradio Demo
 HuggingFace Spaces with ZeroGPU support
+Euler discrete flow matching inference
 """
 import gradio as gr
 import random
 import spaces
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from transformers import T5EncoderModel, T5Tokenizer, CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL
 from PIL import Image
 # ============================================================================
+# MODEL DEFINITION - Exact copy from tinyflux_deep.py
 # ============================================================================
 @dataclass
     def __post_init__(self):
         assert self.num_attention_heads * self.attention_head_dim == self.hidden_size
+        assert sum(self.axes_dims_rope) == self.attention_head_dim
 class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine: bool = True):
         super().__init__()
         self.eps = eps
+        self.elementwise_affine = elementwise_affine
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim))
+        else:
+            self.register_parameter('weight', None)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        norm = x.float().pow(2).mean(-1, keepdim=True).add(self.eps).rsqrt()
+        out = (x * norm).type_as(x)
+        if self.weight is not None:
+            out = out * self.weight
+        return out
 class EmbedND(nn.Module):
+    def __init__(self, theta: float = 10000.0, axes_dim: Tuple[int, int, int] = (16, 56, 56)):
         super().__init__()
         self.theta = theta
         self.axes_dim = axes_dim
             freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
             self.register_buffer(f'freqs_{i}', freqs, persistent=True)
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        device = ids.device
+        n_axes = ids.shape[-1]
+        emb_list = []
+        for i in range(n_axes):
+            freqs = getattr(self, f'freqs_{i}').to(device)
+            pos = ids[:, i].float()
+            angles = pos.unsqueeze(-1) * freqs.unsqueeze(0)
+            cos = angles.cos()
+            sin = angles.sin()
+            emb = torch.stack([cos, sin], dim=-1).flatten(-2)
+            emb_list.append(emb)
+        rope = torch.cat(emb_list, dim=-1)
+        return rope.unsqueeze(1)
+def apply_rotary_emb_old(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+    freqs = freqs_cis.squeeze(1)
+    cos = freqs[:, 0::2].repeat_interleave(2, dim=-1)
+    sin = freqs[:, 1::2].repeat_interleave(2, dim=-1)
+    cos = cos[None, None, :, :].to(x.device)
+    sin = sin[None, None, :, :].to(x.device)
+    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
+    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
+    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
 class MLPEmbedder(nn.Module):
+    def __init__(self, hidden_size: int):
         super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(256, hidden_size),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        half_dim = 128
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=x.device, dtype=x.dtype) * -emb)
+        emb = x.unsqueeze(-1) * emb.unsqueeze(0)
+        emb = torch.cat([emb.sin(), emb.cos()], dim=-1)
+        return self.mlp(emb)
+class AdaLayerNormZero(nn.Module):
+    def __init__(self, hidden_size: int):
         super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        self.norm = RMSNorm(hidden_size)
+    def forward(self, x: torch.Tensor, emb: torch.Tensor):
+        emb_out = self.linear(self.silu(emb))
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb_out.chunk(6, dim=-1)
+        x = self.norm(x) * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
+class AdaLayerNormZeroSingle(nn.Module):
+    def __init__(self, hidden_size: int):
         super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(hidden_size, 3 * hidden_size, bias=True)
+        self.norm = RMSNorm(hidden_size)
+    def forward(self, x: torch.Tensor, emb: torch.Tensor):
+        emb_out = self.linear(self.silu(emb))
+        shift, scale, gate = emb_out.chunk(3, dim=-1)
+        x = self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        return x, gate
+class Attention(nn.Module):
+    def __init__(self, hidden_size: int, num_heads: int, head_dim: int, use_bias: bool = False):
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = head_dim
+        self.qkv = nn.Linear(hidden_size, 3 * num_heads * head_dim, bias=use_bias)
+        self.out_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=use_bias)
+    def forward(self, x: torch.Tensor, rope: Optional[torch.Tensor] = None) -> torch.Tensor:
         B, N, _ = x.shape
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
+        q, k, v = qkv.permute(2, 0, 3, 1, 4)
+        if rope is not None:
+            q = apply_rotary_emb_old(q, rope)
+            k = apply_rotary_emb_old(k, rope)
+        attn = F.scaled_dot_product_attention(q, k, v)
+        out = attn.transpose(1, 2).reshape(B, N, -1)
+        return self.out_proj(out)
+class JointAttention(nn.Module):
+    def __init__(self, hidden_size: int, num_heads: int, head_dim: int, use_bias: bool = False):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.txt_qkv = nn.Linear(hidden_size, 3 * num_heads * head_dim, bias=use_bias)
+        self.img_qkv = nn.Linear(hidden_size, 3 * num_heads * head_dim, bias=use_bias)
+        self.txt_out = nn.Linear(num_heads * head_dim, hidden_size, bias=use_bias)
+        self.img_out = nn.Linear(num_heads * head_dim, hidden_size, bias=use_bias)
+    def forward(self, txt: torch.Tensor, img: torch.Tensor, rope: Optional[torch.Tensor] = None):
+        B, L, _ = txt.shape
+        _, N, _ = img.shape
+        txt_qkv = self.txt_qkv(txt).reshape(B, L, 3, self.num_heads, self.head_dim)
+        img_qkv = self.img_qkv(img).reshape(B, N, 3, self.num_heads, self.head_dim)
+        txt_q, txt_k, txt_v = txt_qkv.permute(2, 0, 3, 1, 4)
+        img_q, img_k, img_v = img_qkv.permute(2, 0, 3, 1, 4)
+        if rope is not None:
+            img_q = apply_rotary_emb_old(img_q, rope)
+            img_k = apply_rotary_emb_old(img_k, rope)
+        k = torch.cat([txt_k, img_k], dim=2)
+        v = torch.cat([txt_v, img_v], dim=2)
+        txt_out = F.scaled_dot_product_attention(txt_q, k, v)
+        txt_out = txt_out.transpose(1, 2).reshape(B, L, -1)
+        img_out = F.scaled_dot_product_attention(img_q, k, v)
+        img_out = img_out.transpose(1, 2).reshape(B, N, -1)
+        return self.txt_out(txt_out), self.img_out(img_out)
+class MLP(nn.Module):
+    def __init__(self, hidden_size: int, mlp_ratio: float = 4.0):
         super().__init__()
         mlp_hidden = int(hidden_size * mlp_ratio)
+        self.fc1 = nn.Linear(hidden_size, mlp_hidden, bias=True)
+        self.act = nn.GELU(approximate='tanh')
+        self.fc2 = nn.Linear(mlp_hidden, hidden_size, bias=True)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fc2(self.act(self.fc1(x)))
+class DoubleStreamBlock(nn.Module):
+    def __init__(self, config: TinyFluxDeepConfig):
+        super().__init__()
+        hidden = config.hidden_size
+        heads = config.num_attention_heads
+        head_dim = config.attention_head_dim
+        self.img_norm1 = AdaLayerNormZero(hidden)
+        self.txt_norm1 = AdaLayerNormZero(hidden)
+        self.attn = JointAttention(hidden, heads, head_dim, use_bias=False)
+        self.img_norm2 = RMSNorm(hidden)
+        self.txt_norm2 = RMSNorm(hidden)
+        self.img_mlp = MLP(hidden, config.mlp_ratio)
+        self.txt_mlp = MLP(hidden, config.mlp_ratio)
+    def forward(self, txt, img, vec, rope=None):
+        img_normed, img_gate_msa, img_shift_mlp, img_scale_mlp, img_gate_mlp = self.img_norm1(img, vec)
+        txt_normed, txt_gate_msa, txt_shift_mlp, txt_scale_mlp, txt_gate_mlp = self.txt_norm1(txt, vec)
+        txt_attn_out, img_attn_out = self.attn(txt_normed, img_normed, rope)
+        txt = txt + txt_gate_msa.unsqueeze(1) * txt_attn_out
+        img = img + img_gate_msa.unsqueeze(1) * img_attn_out
+        txt_mlp_in = self.txt_norm2(txt) * (1 + txt_scale_mlp.unsqueeze(1)) + txt_shift_mlp.unsqueeze(1)
+        img_mlp_in = self.img_norm2(img) * (1 + img_scale_mlp.unsqueeze(1)) + img_shift_mlp.unsqueeze(1)
+        txt = txt + txt_gate_mlp.unsqueeze(1) * self.txt_mlp(txt_mlp_in)
+        img = img + img_gate_mlp.unsqueeze(1) * self.img_mlp(img_mlp_in)
+        return txt, img
+class SingleStreamBlock(nn.Module):
+    def __init__(self, config: TinyFluxDeepConfig):
+        super().__init__()
+        hidden = config.hidden_size
+        heads = config.num_attention_heads
+        head_dim = config.attention_head_dim
+        self.norm = AdaLayerNormZeroSingle(hidden)
+        self.attn = Attention(hidden, heads, head_dim, use_bias=False)
+        self.mlp = MLP(hidden, config.mlp_ratio)
+        self.norm2 = RMSNorm(hidden)
+    def forward(self, txt, img, vec, rope=None):
+        L = txt.shape[1]
+        x = torch.cat([txt, img], dim=1)
+        x_normed, gate = self.norm(x, vec)
+        x = x + gate.unsqueeze(1) * self.attn(x_normed, rope)
+        x = x + self.mlp(self.norm2(x))
+        txt, img = x.split([L, x.shape[1] - L], dim=1)
+        return txt, img
 class TinyFluxDeep(nn.Module):
     """Transformer that runs double-stream blocks followed by single-stream blocks.

     ``hidden_states`` and ``encoder_hidden_states`` are projected to
     ``hidden_size`` (as ``img`` and ``txt``), a conditioning vector is built
     from the timestep, the pooled projection and optionally the guidance
     value, and both token streams are passed through ``double_blocks`` and
     then ``single_blocks``. Only the ``img`` stream is normalised and
     projected back to ``in_channels`` for the return value.
     """

     def __init__(self, config: Optional["TinyFluxDeepConfig"] = None):
         """Build all sub-modules from ``config``.

         Args:
             config: Model hyper-parameters. A default ``TinyFluxDeepConfig()``
                 is created when omitted (or when a falsy value is passed).
         """
         super().__init__()
         self.config = config or TinyFluxDeepConfig()
         cfg = self.config

         # Input projections into the shared hidden size.
         self.img_in = nn.Linear(cfg.in_channels, cfg.hidden_size, bias=True)
         self.txt_in = nn.Linear(cfg.joint_attention_dim, cfg.hidden_size, bias=True)

         # Conditioning embedders; their outputs are summed in ``forward``.
         self.time_in = MLPEmbedder(cfg.hidden_size)
         self.vector_in = nn.Sequential(
             nn.SiLU(),
             nn.Linear(cfg.pooled_projection_dim, cfg.hidden_size, bias=True),
         )
         # ``guidance_in`` only exists when the config enables guidance embeds.
         if cfg.guidance_embeds:
             self.guidance_in = MLPEmbedder(cfg.hidden_size)

         self.rope = EmbedND(theta=10000.0, axes_dim=cfg.axes_dims_rope)

         self.double_blocks = nn.ModuleList(
             [DoubleStreamBlock(cfg) for _ in range(cfg.num_double_layers)]
         )
         self.single_blocks = nn.ModuleList(
             [SingleStreamBlock(cfg) for _ in range(cfg.num_single_layers)]
         )

         self.final_norm = RMSNorm(cfg.hidden_size)
         self.final_linear = nn.Linear(cfg.hidden_size, cfg.in_channels, bias=True)

     def forward(self, hidden_states, encoder_hidden_states, pooled_projections, timestep,
                 img_ids, txt_ids=None, guidance=None):
         """Run the model and return the projected ``img`` stream.

         Args:
             hidden_states: Input to ``img_in``; last dim must be ``in_channels``.
             encoder_hidden_states: Input to ``txt_in``; dimension 1 is used as
                 the text length when default ``txt_ids`` are created.
             pooled_projections: Input to ``vector_in``.
             timestep: Input to ``time_in``.
             img_ids: Position ids for the image tokens. A 3-D tensor is
                 reduced to its first entry along dim 0 (ids are presumably
                 identical across the batch - TODO confirm with callers).
             txt_ids: Optional position ids for the text tokens. When ``None``,
                 all-zero ids of shape ``(text_len, 3)`` are used; a 3-D tensor
                 is reduced to its first entry like ``img_ids``.
             guidance: Optional guidance value. It is only embedded when
                 ``config.guidance_embeds`` is true and a value is given.

         Returns:
             Output of ``final_linear(final_norm(img))``.
         """
         txt_len = encoder_hidden_states.shape[1]

         img = self.img_in(hidden_states)
         txt = self.txt_in(encoder_hidden_states)

         # Conditioning vector: timestep + pooled projection (+ guidance).
         vec = self.time_in(timestep)
         vec = vec + self.vector_in(pooled_projections)
         if self.config.guidance_embeds and guidance is not None:
             vec = vec + self.guidance_in(guidance)

         if img_ids.ndim == 3:
             img_ids = img_ids[0]
         img_rope = self.rope(img_ids)

         # Double-stream blocks receive the rotary embedding of the image ids only.
         for block in self.double_blocks:
             txt, img = block(txt, img, vec, img_rope)

         if txt_ids is None:
             txt_ids = torch.zeros(txt_len, 3, device=img_ids.device, dtype=img_ids.dtype)
         elif txt_ids.ndim == 3:
             txt_ids = txt_ids[0]

         # Single-stream blocks get one embedding over text ids followed by
         # image ids, matching the [txt, img] concatenation order.
         full_rope = self.rope(torch.cat([txt_ids, img_ids], dim=0))

         for block in self.single_blocks:
             txt, img = block(txt, img, vec, full_rope)

         return self.final_linear(self.final_norm(img))

     @staticmethod
     def create_img_ids(batch_size: int, height: int, width: int, device) -> torch.Tensor:
         """Return row-major position ids for a ``height`` x ``width`` grid.

         Args:
             batch_size: Accepted for interface compatibility; not used, the
                 result has no batch dimension.
             height: Number of grid rows.
             width: Number of grid columns.
             device: Device on which the tensor is created.

         Returns:
             Tensor of shape ``(height * width, 3)`` in the default float
             dtype. Column 0 is zero, column 1 is the row index and column 2
             is the column index of entry ``row * width + col``.
         """
         img_ids = torch.zeros(height * width, 3, device=device)
         # Vectorised fill: avoids height * width Python iterations, each of
         # which issued several scalar tensor writes (slow on CUDA devices).
         img_ids[:, 1] = torch.arange(height, device=device).repeat_interleave(width)
         img_ids[:, 2] = torch.arange(width, device=device).repeat(height)
         return img_ids

     @staticmethod
     def create_txt_ids(text_len: int, device) -> torch.Tensor:
         """Return ``(text_len, 3)`` ids with ``0..text_len-1`` in column 0 and zeros elsewhere."""
         txt_ids = torch.zeros(text_len, 3, device=device)
         txt_ids[:, 0] = torch.arange(text_len, device=device)
         return txt_ids
 # ============================================================================
 # Upper bound for seeds: the largest signed 32-bit integer.
 MAX_SEED = np.iinfo(np.int32).max
 # Default shift factor used by flux_shift() when building the timestep schedule.
 SHIFT = 3.0
 # ============================================================================
+# LOAD MODELS
 # ============================================================================
 # Everything below runs once at import time.
 print("Loading TinyFlux-Lailah...")
 config = TinyFluxDeepConfig()
 model = TinyFluxDeep(config)
 # Download (or reuse from the local Hub cache) the EMA checkpoint file.
 weights_path = hf_hub_download("AbstractPhil/tiny-flux-deep", "checkpoints/step_297500_ema.safetensors")
 weights = load_file(weights_path)
 # strict=False: key mismatches between checkpoint and model do not raise,
 # and the returned missing/unexpected-key report is discarded here.
 model.load_state_dict(weights, strict=False)
 # Cast the model to DTYPE (defined outside this span).
 model.to(DTYPE)
 print(f"✓ Model loaded ({sum(p.numel() for p in model.parameters()):,} params)")
 print("Loading text encoders...")
 # T5 tokenizer + encoder and the CLIP text encoder, all loaded in DTYPE.
 # NOTE(review): the prompt-encoding code also calls `clip_tok`, but no
 # assignment to it appears in this span — confirm it is defined elsewhere.
 t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
 t5_enc = T5EncoderModel.from_pretrained("google/flan-t5-base", torch_dtype=DTYPE)
 clip_enc = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=DTYPE)
 print("✓ Text encoders loaded")
 print("Loading VAE...")
 # The VAE is read from the local "./vae" directory rather than a Hub repo id.
 vae = AutoencoderKL.from_pretrained("./vae", torch_dtype=DTYPE)
 vae.eval()
 # Latents are divided by this factor before being passed to vae.decode().
 VAE_SCALE = vae.config.scaling_factor
+print(f"✓ VAE loaded (scale={VAE_SCALE})")
 # ============================================================================
+# EULER DISCRETE FLOW MATCHING SAMPLER
 # ============================================================================
def flux_shift(t, shift=SHIFT):
    """Apply the Flux time shift ``shift * t / (1 + (shift - 1) * t)`` to ``t``."""
    scaled = shift * t
    warp = 1 + (shift - 1) * t
    return scaled / warp
 @spaces.GPU(duration=90)
     num_inference_steps: int,
     progress=gr.Progress(track_tqdm=True),
 ):
     # Samples one image for the given prompt/settings and returns
     # (PIL image, seed actually used).
     # When randomization is on, a freshly drawn seed replaces the supplied one.
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     # NOTE(review): manual_seed expects an integer — confirm the UI delivers one.
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
+    # Move to GPU
     model.to(DEVICE)
     t5_enc.to(DEVICE)
     clip_enc.to(DEVICE)
     vae.to(DEVICE)
     # No autograd tracking; ops inside run under CUDA autocast with DTYPE.
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=DTYPE):
         # Encode prompt
         # T5 per-token hidden states feed encoder_hidden_states; the CLIP pooled
         # output feeds pooled_projections (see the model call below).
+        t5_in = t5_tok(prompt, max_length=128, padding="max_length",
+                       truncation=True, return_tensors="pt").to(DEVICE)
+        t5_out = t5_enc(**t5_in).last_hidden_state
+        clip_in = clip_tok(prompt, max_length=77, padding="max_length",
+                          truncation=True, return_tensors="pt").to(DEVICE)
+        clip_out = clip_enc(**clip_in).pooler_output
         # Latent dimensions
         # NOTE(review): // 8 presumably matches the VAE's spatial downsampling — confirm.
         H_lat = height // 8
         W_lat = width // 8
         # NOTE(review): C presumably equals the VAE latent / model in_channels — confirm.
         C = 16
         # Same value as max_length in the T5 tokenizer call above.
+        L = 128  # T5 sequence length
+        # Start from noise (t=1 in flow matching)
         # One token per latent grid cell: shape (1, H_lat * W_lat, C).
         x = torch.randn(1, H_lat * W_lat, C, device=DEVICE, dtype=DTYPE, generator=generator)
+        # Position IDs
+        img_ids = TinyFluxDeep.create_img_ids(1, H_lat, W_lat, DEVICE)
+        txt_ids = TinyFluxDeep.create_txt_ids(L, DEVICE)
+        # Timesteps: 1 -> 0 with Flux shift
         # num_inference_steps + 1 points so every step has a (t_curr, t_next) pair.
+        t_linear = torch.linspace(1, 0, num_inference_steps + 1, device=DEVICE)
+        timesteps = flux_shift(t_linear, shift=SHIFT)
+        # Euler discrete flow matching: x_{t-dt} = x_t + v * dt
         for i in range(num_inference_steps):
             t_curr = timesteps[i]
             t_next = timesteps[i + 1]
+            dt = t_next - t_curr  # Negative since going 1->0
             # The model is called with a one-element timestep batch.
             t_batch = t_curr.unsqueeze(0)
             # NOTE(review): guidance does not depend on i; it could be built once
             # before the loop.
             guidance = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)
+            v = model(
+                hidden_states=x,
+                encoder_hidden_states=t5_out,
+                pooled_projections=clip_out,
+                timestep=t_batch,
+                img_ids=img_ids,
+                txt_ids=txt_ids,
+                guidance=guidance,
+            )
             x = x + v * dt
+        # Decode latents
         # Token sequence -> (1, C, H_lat, W_lat) feature map for the VAE.
         latents = x.reshape(1, H_lat, W_lat, C).permute(0, 3, 1, 2)
         latents = latents / VAE_SCALE
         image = vae.decode(latents.to(vae.dtype)).sample
         # Rescale and clamp to [0, 1]; assumes decoder output is roughly in
         # [-1, 1] — TODO confirm.
         image = (image / 2 + 0.5).clamp(0, 1)
         # To PIL
         # Channels-last float array, then 8-bit pixel values.
         image = image[0].float().permute(1, 2, 0).cpu().numpy()
         image = (image * 255).astype(np.uint8)
         image = Image.fromarray(image)
     return image, seed
         **241M parameter** flow-matching text-to-image model.
         Trained on teacher latents from Flux-Schnell.
+        [Model Card](https://huggingface.co/AbstractPhil/tiny-flux-deep)
         """)
         with gr.Row():
             prompt = gr.Text(
                 label="Prompt",
+                value="cat",
                 show_label=False,
                 max_lines=2,
                 placeholder="Enter your prompt...",
             negative_prompt = gr.Text(
                 label="Negative prompt",
                 max_lines=1,
+                placeholder="(not used)",
                 visible=False,
             )
+            seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
             with gr.Row():
+                width = gr.Slider(label="Width", minimum=256, maximum=1024, step=64, value=512)
+                height = gr.Slider(label="Height", minimum=256, maximum=1024, step=64, value=512)
             with gr.Row():
+                guidance_scale = gr.Slider(label="Guidance", minimum=1.0, maximum=10.0, step=0.5, value=3.5)
+                num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=1, value=28)
         gr.Examples(examples=examples, inputs=[prompt])
         gr.Markdown("""
         ---
+        **Notes:** Trained at 512×512. Best results at guidance 3.0-5.0, 20-30 steps.
         """)
     gr.on(
         triggers=[run_button.click, prompt.submit],
         fn=generate,
+        inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
         outputs=[result, seed],
     )