Create app.py
app.py
ADDED
@@ -0,0 +1,539 @@
"""
TinyFlux-Lailah Gradio Demo
HuggingFace Spaces with ZeroGPU support
"""

import gradio as gr
import numpy as np
import random
import spaces
import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import T5EncoderModel, T5Tokenizer, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from typing import Tuple

# ============================================================================
# MODEL DEFINITION (TinyFluxDeep / Lailah)
# ============================================================================

@dataclass
class TinyFluxDeepConfig:
    hidden_size: int = 512
    num_attention_heads: int = 4
    attention_head_dim: int = 128
    in_channels: int = 16
    patch_size: int = 1
    joint_attention_dim: int = 768
    pooled_projection_dim: int = 768
    num_double_layers: int = 15
    num_single_layers: int = 25
    mlp_ratio: float = 4.0
    axes_dims_rope: Tuple[int, int, int] = (16, 56, 56)
    guidance_embeds: bool = True

    def __post_init__(self):
        assert self.num_attention_heads * self.attention_head_dim == self.hidden_size

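# With these defaults: hidden_size 512 = 4 heads x 128 head_dim, and in_channels 16
# matches the 16-channel Flux VAE latents used below (a 512x512 image becomes a
# 64x64 latent grid, i.e. 4096 tokens at patch_size 1).
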
class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        dtype = x.dtype
        x = x.float()
        norm = x.pow(2).mean(-1, keepdim=True).add(self.eps).rsqrt()
        return (x * norm).to(dtype) * self.weight

class EmbedND(nn.Module):
    def __init__(self, theta=10000.0, axes_dim=(16, 56, 56)):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim
        for i, dim in enumerate(axes_dim):
            freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
            self.register_buffer(f'freqs_{i}', freqs, persistent=True)

    def forward(self, ids):
        rope_components = []
        for i, dim in enumerate(self.axes_dim):
            freqs = getattr(self, f'freqs_{i}').to(ids.device)
            axis_ids = ids[..., i:i+1]
            angles = axis_ids * freqs
            cos = torch.cos(angles)
            sin = torch.sin(angles)
            interleaved = torch.stack([cos, sin], dim=-1).flatten(-2)
            rope_components.append(interleaved)
        return torch.cat(rope_components, dim=-1)

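# Each position id has three axes (extra, row, column); their per-axis rotary tables
# are 16 + 56 + 56 = 128 values wide, exactly one per attention head channel, so
# EmbedND(ids) returns an interleaved (cos, sin) table of shape (B, N, head_dim).
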
def apply_rope(x, rope):
    B, H, N, D = x.shape
    rope = rope[:, :N, :D]
    if rope.shape[1] < N:
        # The single blocks attend over [txt, img] while `rope` only covers the image
        # positions; pad the leading text positions with an identity rotation
        # (cos=1, sin=0), matching the double blocks, where text q/k are left unrotated.
        pad = torch.zeros(rope.shape[0], N - rope.shape[1], rope.shape[-1],
                          device=rope.device, dtype=rope.dtype)
        pad[..., 0::2] = 1.0
        rope = torch.cat([pad, rope], dim=1)
    rope = rope.unsqueeze(1)
    x_pairs = x.reshape(B, H, N, D // 2, 2)
    cos = rope[..., 0::2]
    sin = rope[..., 1::2]
    x_rot = torch.stack([
        x_pairs[..., 0] * cos - x_pairs[..., 1] * sin,
        x_pairs[..., 1] * cos + x_pairs[..., 0] * sin,
    ], dim=-1)
    # Cast back so q/k keep the input dtype and match v in scaled_dot_product_attention.
    return x_rot.flatten(-2).to(x.dtype)

class MLPEmbedder(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        return self.fc2(F.silu(self.fc1(x)))


class QKNorm(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.query_norm = RMSNorm(dim)
        self.key_norm = RMSNorm(dim)

    def forward(self, q, k):
        return self.query_norm(q), self.key_norm(k)

class DoubleAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, head_dim):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        qkv_dim = num_heads * head_dim * 3
        self.img_qkv = nn.Linear(hidden_size, qkv_dim, bias=True)
        self.img_out = nn.Linear(num_heads * head_dim, hidden_size, bias=True)
        self.txt_qkv = nn.Linear(hidden_size, qkv_dim, bias=True)
        self.txt_out = nn.Linear(num_heads * head_dim, hidden_size, bias=True)
        self.img_norm = QKNorm(head_dim)
        self.txt_norm = QKNorm(head_dim)

    def forward(self, img, txt, rope):
        B, N_img, _ = img.shape
        N_txt = txt.shape[1]
        img_qkv = self.img_qkv(img).reshape(B, N_img, 3, self.num_heads, self.head_dim)
        img_q, img_k, img_v = img_qkv.permute(2, 0, 3, 1, 4).unbind(0)
        img_q, img_k = self.img_norm(img_q, img_k)
        img_q = apply_rope(img_q, rope)
        img_k = apply_rope(img_k, rope)
        txt_qkv = self.txt_qkv(txt).reshape(B, N_txt, 3, self.num_heads, self.head_dim)
        txt_q, txt_k, txt_v = txt_qkv.permute(2, 0, 3, 1, 4).unbind(0)
        txt_q, txt_k = self.txt_norm(txt_q, txt_k)
        q = torch.cat([txt_q, img_q], dim=2)
        k = torch.cat([txt_k, img_k], dim=2)
        v = torch.cat([txt_v, img_v], dim=2)
        attn_out = F.scaled_dot_product_attention(q, k, v)
        txt_out, img_out = attn_out.split([N_txt, N_img], dim=2)
        img_out = img_out.transpose(1, 2).reshape(B, N_img, -1)
        txt_out = txt_out.transpose(1, 2).reshape(B, N_txt, -1)
        return self.img_out(img_out), self.txt_out(txt_out)

class DoubleBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, head_dim, mlp_ratio=4.0):
        super().__init__()
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mod = nn.Linear(hidden_size, hidden_size * 6, bias=True)
        self.txt_mod = nn.Linear(hidden_size, hidden_size * 6, bias=True)
        self.attn = DoubleAttention(hidden_size, num_heads, head_dim)
        mlp_hidden = int(hidden_size * mlp_ratio)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden, hidden_size, bias=True),
        )
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden, hidden_size, bias=True),
        )

    def forward(self, img, txt, cond, rope):
        img_mod = self.img_mod(cond)
        img_scale1, img_shift1, img_gate1, img_scale2, img_shift2, img_gate2 = img_mod.chunk(6, dim=-1)
        txt_mod = self.txt_mod(cond)
        txt_scale1, txt_shift1, txt_gate1, txt_scale2, txt_shift2, txt_gate2 = txt_mod.chunk(6, dim=-1)
        img_normed = self.img_norm1(img) * (1 + img_scale1.unsqueeze(1)) + img_shift1.unsqueeze(1)
        txt_normed = self.txt_norm1(txt) * (1 + txt_scale1.unsqueeze(1)) + txt_shift1.unsqueeze(1)
        img_attn, txt_attn = self.attn(img_normed, txt_normed, rope)
        img = img + img_gate1.unsqueeze(1) * img_attn
        txt = txt + txt_gate1.unsqueeze(1) * txt_attn
        img_normed2 = self.img_norm2(img) * (1 + img_scale2.unsqueeze(1)) + img_shift2.unsqueeze(1)
        txt_normed2 = self.txt_norm2(txt) * (1 + txt_scale2.unsqueeze(1)) + txt_shift2.unsqueeze(1)
        img = img + img_gate2.unsqueeze(1) * self.img_mlp(img_normed2)
        txt = txt + txt_gate2.unsqueeze(1) * self.txt_mlp(txt_normed2)
        return img, txt

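# Adaptive LayerNorm (adaLN) modulation: the conditioning vector yields six
# hidden_size-wide chunks per stream, i.e. one (scale, shift, gate) triple for the
# attention sub-layer and one for the MLP sub-layer.
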
class SingleAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, head_dim):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.qkv = nn.Linear(hidden_size, num_heads * head_dim * 3, bias=True)
        self.norm = QKNorm(head_dim)

    def forward(self, x, rope):
        B, N, _ = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
        q, k = self.norm(q, k)
        q = apply_rope(q, rope)
        k = apply_rope(k, rope)
        out = F.scaled_dot_product_attention(q, k, v)
        return out.transpose(1, 2).reshape(B, N, -1)

class SingleBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, head_dim, mlp_ratio=4.0):
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.mod = nn.Linear(hidden_size, hidden_size * 3, bias=True)
        self.attn = SingleAttention(hidden_size, num_heads, head_dim)
        self.proj = nn.Linear(num_heads * head_dim, hidden_size, bias=True)
        mlp_hidden = int(hidden_size * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden, hidden_size, bias=True),
        )

    def forward(self, x, cond, rope):
        mod = self.mod(cond)
        scale, shift, gate = mod.chunk(3, dim=-1)
        normed = self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
        attn_out = self.proj(self.attn(normed, rope))
        mlp_out = self.mlp(normed)
        return x + gate.unsqueeze(1) * (attn_out + mlp_out)

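# Unlike the double blocks, the single blocks run attention and the MLP in parallel
# off the same modulated input and add both through one gated residual, in the style
# of Flux's single-stream blocks.
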
class TinyFluxDeep(nn.Module):
    def __init__(self, cfg: TinyFluxDeepConfig):
        super().__init__()
        self.cfg = cfg
        self.img_in = nn.Linear(cfg.in_channels, cfg.hidden_size, bias=True)
        self.txt_in = nn.Linear(cfg.joint_attention_dim, cfg.hidden_size, bias=True)
        self.time_in = MLPEmbedder(256, cfg.hidden_size)
        self.guidance_in = MLPEmbedder(256, cfg.hidden_size)
        self.vector_in = MLPEmbedder(cfg.pooled_projection_dim, cfg.hidden_size)
        self.rope = EmbedND(axes_dim=cfg.axes_dims_rope)
        self.double_blocks = nn.ModuleList([
            DoubleBlock(cfg.hidden_size, cfg.num_attention_heads, cfg.attention_head_dim, cfg.mlp_ratio)
            for _ in range(cfg.num_double_layers)
        ])
        self.single_blocks = nn.ModuleList([
            SingleBlock(cfg.hidden_size, cfg.num_attention_heads, cfg.attention_head_dim, cfg.mlp_ratio)
            for _ in range(cfg.num_single_layers)
        ])
        self.final_norm = nn.LayerNorm(cfg.hidden_size, elementwise_affine=False, eps=1e-6)
        self.final_mod = nn.Linear(cfg.hidden_size, cfg.hidden_size * 2, bias=True)
        self.final_linear = nn.Linear(cfg.hidden_size, cfg.in_channels, bias=True)

    def time_embed(self, t):
        half_dim = 128
        freqs = torch.exp(-math.log(10000) * torch.arange(half_dim, device=t.device) / half_dim)
        args = t.unsqueeze(-1) * freqs * 1000
        # freqs is float32; cast back to the timestep dtype so the bf16/fp16 MLPEmbedders accept it.
        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(t.dtype)

    @staticmethod
    def create_img_ids(batch_size, h, w, device):
        img_ids = torch.zeros(h, w, 3, device=device)
        img_ids[..., 1] = torch.arange(h, device=device).unsqueeze(1)
        img_ids[..., 2] = torch.arange(w, device=device).unsqueeze(0)
        return img_ids.reshape(1, h * w, 3).expand(batch_size, -1, -1)

    def forward(self, hidden_states, encoder_hidden_states, pooled_projections, timestep, img_ids, guidance=None):
        img = self.img_in(hidden_states)
        txt = self.txt_in(encoder_hidden_states)
        t_emb = self.time_embed(timestep)
        cond = self.time_in(t_emb)
        if guidance is not None and self.cfg.guidance_embeds:
            g_emb = self.time_embed(guidance)
            cond = cond + self.guidance_in(g_emb)
        cond = cond + self.vector_in(pooled_projections)
        rope = self.rope(img_ids)
        for block in self.double_blocks:
            img, txt = block(img, txt, cond, rope)
        x = torch.cat([txt, img], dim=1)
        for block in self.single_blocks:
            x = block(x, cond, rope)
        img = x[:, txt.shape[1]:, :]
        mod = self.final_mod(cond)
        scale, shift = mod.chunk(2, dim=-1)
        img = self.final_norm(img) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
        return self.final_linear(img)

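# Shape sketch for the defaults and a 512x512 image (assuming a batch of one):
#   hidden_states          (1, 4096, 16)   packed 64x64 VAE latents
#   encoder_hidden_states  (1, 128, 768)   flan-t5-base sequence embeddings
#   pooled_projections     (1, 768)        CLIP ViT-L pooled text embedding
#   timestep / guidance    (1,)
#   img_ids                (1, 4096, 3)
#   output                 (1, 4096, 16)   predicted velocity in latent space
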
# ============================================================================
# GLOBALS
# ============================================================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
MAX_SEED = np.iinfo(np.int32).max
SHIFT = 3.0

# ============================================================================
# LOAD MODELS (outside GPU function for ZeroGPU compatibility)
# ============================================================================
print("Loading TinyFlux-Lailah...")

# Model
config = TinyFluxDeepConfig()
model = TinyFluxDeep(config)

# Load EMA weights (best quality)
weights_path = hf_hub_download("AbstractPhil/tiny-flux-deep", "checkpoints/step_286250_ema.safetensors")
weights = load_file(weights_path)
model.load_state_dict(weights, strict=False)
model.eval()
model.to(DTYPE)
print(f"✓ Model loaded ({sum(p.numel() for p in model.parameters()):,} params)")

# Text encoders
print("Loading text encoders...")
t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
t5_enc = T5EncoderModel.from_pretrained("google/flan-t5-base", torch_dtype=DTYPE)
clip_tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
clip_enc = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=DTYPE)
print("✓ Text encoders loaded")

# VAE
print("Loading VAE...")
vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-schnell", subfolder="vae", torch_dtype=DTYPE)
VAE_SCALE = vae.config.scaling_factor
print("✓ VAE loaded")

# ============================================================================
# INFERENCE FUNCTIONS
# ============================================================================
def flux_shift(t, s=SHIFT):
    return s * t / (1 + (s - 1) * t)

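# Example: with s=3.0 the endpoints stay fixed (0 -> 0, 1 -> 1) but midpoints are
# pulled toward 1, e.g. flux_shift(0.5) = 1.5 / 2.0 = 0.75. Since sampling below
# integrates from t=0 (noise) to t=1, the shifted schedule takes larger steps early
# and finer steps as the latent approaches the image.
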
@spaces.GPU(duration=90)
def generate(
    prompt: str,
    negative_prompt: str,
    seed: int,
    randomize_seed: bool,
    width: int,
    height: int,
    guidance_scale: float,
    num_inference_steps: int,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate image with TinyFlux-Lailah."""
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator(device=DEVICE).manual_seed(seed)

    # Move models to GPU
    model.to(DEVICE)
    t5_enc.to(DEVICE)
    clip_enc.to(DEVICE)
    vae.to(DEVICE)

    with torch.inference_mode():
        # Encode prompt
        t5_in = t5_tok(
            prompt, max_length=128, padding="max_length",
            truncation=True, return_tensors="pt"
        ).to(DEVICE)
        t5_out = t5_enc(**t5_in).last_hidden_state.to(DTYPE)

        clip_in = clip_tok(
            prompt, max_length=77, padding="max_length",
            truncation=True, return_tensors="pt"
        ).to(DEVICE)
        clip_out = clip_enc(**clip_in).pooler_output.to(DTYPE)

        # Latent dimensions
        H_lat = height // 8
        W_lat = width // 8
        C = 16

        # Start from noise
        x = torch.randn(1, H_lat * W_lat, C, device=DEVICE, dtype=DTYPE, generator=generator)
        img_ids = TinyFluxDeep.create_img_ids(1, H_lat, W_lat, DEVICE)

        # Timesteps with Flux shift
        t_linear = torch.linspace(0, 1, num_inference_steps + 1, device=DEVICE, dtype=DTYPE)
        timesteps = flux_shift(t_linear, s=SHIFT)

        # Euler sampling
        for i in range(num_inference_steps):
            t_curr = timesteps[i]
            t_next = timesteps[i + 1]
            dt = t_next - t_curr

            t_batch = t_curr.unsqueeze(0)
            guidance = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)

            v = model(
                hidden_states=x,
                encoder_hidden_states=t5_out,
                pooled_projections=clip_out,
                timestep=t_batch,
                img_ids=img_ids,
                guidance=guidance,
            )
            x = x + v * dt

        # Decode
        latents = x.reshape(1, H_lat, W_lat, C).permute(0, 3, 1, 2)
        latents = latents / VAE_SCALE
        image = vae.decode(latents.to(vae.dtype)).sample
        image = (image / 2 + 0.5).clamp(0, 1)

        # To PIL
        image = image[0].float().permute(1, 2, 0).cpu().numpy()
        image = (image * 255).astype(np.uint8)
        image = Image.fromarray(image)

    return image, seed

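# Direct call (hypothetical, bypassing the UI; argument order matches the signature above):
#   image, used_seed = generate("a photo of a cat sitting on a windowsill", "", 42, False, 512, 512, 3.5, 28)
#   image.save("cat.png")
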
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
examples = [
    "a photo of a cat sitting on a windowsill",
    "a portrait of a woman with red hair, professional photography",
    "a black backpack on white background, product photo",
    "astronaut riding a horse on mars, digital art",
    "a cozy coffee shop interior, warm lighting",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 720px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # TinyFlux-Lailah

        **241M parameter** flow-matching text-to-image model.
        Trained on teacher latents from Flux-Schnell.

        [Model Card](https://huggingface.co/AbstractPhil/tiny-flux-deep) |
        [GitHub](https://github.com/AbstractPhil)
        """)

        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=2,
                placeholder="Enter your prompt...",
                container=False,
            )
            run_button = gr.Button("Generate", scale=0, variant="primary")

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Settings", open=False):
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="(not used in this model)",
                visible=False,
            )

            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=42,
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            with gr.Row():
                width = gr.Slider(
                    label="Width",
                    minimum=256,
                    maximum=768,
                    step=64,
                    value=512,
                )

                height = gr.Slider(
                    label="Height",
                    minimum=256,
                    maximum=768,
                    step=64,
                    value=512,
                )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.5,
                    value=3.5,
                )

                num_inference_steps = gr.Slider(
                    label="Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=28,
                )

        gr.Examples(examples=examples, inputs=[prompt])

        gr.Markdown("""
        ---
        **Notes:**
        - Trained on 512×512 resolution
        - Best results at guidance 3.0-5.0
        - 20-30 steps recommended
        - Early checkpoint - quality improving with training
        """)

    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            seed,
            randomize_seed,
            width,
            height,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, seed],
    )

if __name__ == "__main__":
    demo.launch()