AbstractPhil committed on
Commit
083a11a
·
verified ·
1 Parent(s): d4b69df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -25
app.py CHANGED
@@ -370,7 +370,7 @@ print(f"✓ VAE loaded (scale={VAE_SCALE})")
370
 
371
 
372
  # ============================================================================
373
- # EULER DISCRETE FLOW MATCHING SAMPLER
374
  # Training uses: x_t = (1-t)*noise + t*data, v = data - noise
375
  # So t=0 is noise, t=1 is data. We sample from t=0 to t=1.
376
  # ============================================================================
@@ -403,51 +403,82 @@ def generate(
403
  vae.to(DEVICE)
404
 
405
  with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=DTYPE):
406
- # Encode prompt
407
  t5_in = t5_tok(prompt, max_length=128, padding="max_length",
408
  truncation=True, return_tensors="pt").to(DEVICE)
409
- t5_out = t5_enc(**t5_in).last_hidden_state
410
 
411
  clip_in = clip_tok(prompt, max_length=77, padding="max_length",
412
  truncation=True, return_tensors="pt").to(DEVICE)
413
- clip_out = clip_enc(**clip_in).pooler_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
  # Latent dimensions
416
  H_lat = height // 8
417
  W_lat = width // 8
418
  C = 16
419
- L = 128 # T5 sequence length
420
 
421
  # Start from noise (t=0 in this convention)
422
  x = torch.randn(1, H_lat * W_lat, C, device=DEVICE, dtype=DTYPE, generator=generator)
423
 
424
  # Position IDs
425
  img_ids = TinyFluxDeep.create_img_ids(1, H_lat, W_lat, DEVICE)
426
- txt_ids = TinyFluxDeep.create_txt_ids(L, DEVICE)
427
 
428
  # Timesteps: 0 -> 1 (noise to data) with Flux shift
429
  t_linear = torch.linspace(0, 1, num_inference_steps + 1, device=DEVICE)
430
- timesteps = flux_shift(t_linear, shift=SHIFT).clamp(1e-4, 1 - 1e-4)
431
 
432
  # Euler flow matching: x_{t+dt} = x_t + v * dt
433
- # v predicts direction from noise to data
434
  for i in range(num_inference_steps):
435
  t_curr = timesteps[i]
436
  t_next = timesteps[i + 1]
437
- dt = t_next - t_curr # Positive since going 0->1
438
 
439
  t_batch = t_curr.unsqueeze(0)
440
- guidance = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)
441
-
442
- v = model(
443
- hidden_states=x,
444
- encoder_hidden_states=t5_out,
445
- pooled_projections=clip_out,
446
- timestep=t_batch,
447
- img_ids=img_ids,
448
- txt_ids=txt_ids,
449
- guidance=guidance,
450
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  x = x + v * dt
452
 
453
  # Decode latents
@@ -509,8 +540,8 @@ with gr.Blocks(css=css) as demo:
509
  negative_prompt = gr.Text(
510
  label="Negative prompt",
511
  max_lines=1,
512
- placeholder="(not used)",
513
- visible=False,
514
  )
515
  seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
516
  randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
@@ -520,14 +551,14 @@ with gr.Blocks(css=css) as demo:
520
  height = gr.Slider(label="Height", minimum=256, maximum=1024, step=64, value=512)
521
 
522
  with gr.Row():
523
- guidance_scale = gr.Slider(label="Guidance", minimum=1.0, maximum=10.0, step=0.5, value=3.5)
524
- num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=1, value=28)
525
 
526
  gr.Examples(examples=examples, inputs=[prompt])
527
 
528
  gr.Markdown("""
529
  ---
530
- **Notes:** Trained at 512×512. Best results at guidance 3.0-5.0, 20-30 steps.
531
  """)
532
 
533
  gr.on(
 
370
 
371
 
372
  # ============================================================================
373
+ # EULER DISCRETE FLOW MATCHING SAMPLER WITH CFG
374
  # Training uses: x_t = (1-t)*noise + t*data, v = data - noise
375
  # So t=0 is noise, t=1 is data. We sample from t=0 to t=1.
376
  # ============================================================================
 
403
  vae.to(DEVICE)
404
 
405
  with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=DTYPE):
406
+ # Encode prompts
407
  t5_in = t5_tok(prompt, max_length=128, padding="max_length",
408
  truncation=True, return_tensors="pt").to(DEVICE)
409
+ t5_cond = t5_enc(**t5_in).last_hidden_state
410
 
411
  clip_in = clip_tok(prompt, max_length=77, padding="max_length",
412
  truncation=True, return_tensors="pt").to(DEVICE)
413
+ clip_cond = clip_enc(**clip_in).pooler_output
414
+
415
+ # Encode negative prompt for CFG
416
+ do_cfg = guidance_scale > 1.0
417
+ if do_cfg:
418
+ neg_prompt = negative_prompt if negative_prompt else ""
419
+ t5_neg_in = t5_tok(neg_prompt, max_length=128, padding="max_length",
420
+ truncation=True, return_tensors="pt").to(DEVICE)
421
+ t5_uncond = t5_enc(**t5_neg_in).last_hidden_state
422
+
423
+ clip_neg_in = clip_tok(neg_prompt, max_length=77, padding="max_length",
424
+ truncation=True, return_tensors="pt").to(DEVICE)
425
+ clip_uncond = clip_enc(**clip_neg_in).pooler_output
426
+
427
+ # Batch for efficient forward pass
428
+ t5_batch = torch.cat([t5_uncond, t5_cond], dim=0)
429
+ clip_batch = torch.cat([clip_uncond, clip_cond], dim=0)
430
 
431
  # Latent dimensions
432
  H_lat = height // 8
433
  W_lat = width // 8
434
  C = 16
 
435
 
436
  # Start from noise (t=0 in this convention)
437
  x = torch.randn(1, H_lat * W_lat, C, device=DEVICE, dtype=DTYPE, generator=generator)
438
 
439
  # Position IDs
440
  img_ids = TinyFluxDeep.create_img_ids(1, H_lat, W_lat, DEVICE)
 
441
 
442
  # Timesteps: 0 -> 1 (noise to data) with Flux shift
443
  t_linear = torch.linspace(0, 1, num_inference_steps + 1, device=DEVICE)
444
+ timesteps = flux_shift(t_linear, shift=SHIFT)
445
 
446
  # Euler flow matching: x_{t+dt} = x_t + v * dt
 
447
  for i in range(num_inference_steps):
448
  t_curr = timesteps[i]
449
  t_next = timesteps[i + 1]
450
+ dt = t_next - t_curr
451
 
452
  t_batch = t_curr.unsqueeze(0)
453
+ guidance_embed = torch.tensor([guidance_scale], device=DEVICE, dtype=DTYPE)
454
+
455
+ if do_cfg:
456
+ # Batched forward pass for efficiency
457
+ x_batch = x.repeat(2, 1, 1)
458
+ img_ids_batch = img_ids
459
+ t_batch_2 = t_batch.repeat(2)
460
+ guidance_batch = guidance_embed.repeat(2)
461
+
462
+ v_batch = model(
463
+ hidden_states=x_batch,
464
+ encoder_hidden_states=t5_batch,
465
+ pooled_projections=clip_batch,
466
+ timestep=t_batch_2,
467
+ img_ids=img_ids_batch,
468
+ guidance=guidance_batch,
469
+ )
470
+ v_uncond, v_cond = v_batch.chunk(2, dim=0)
471
+ v = v_uncond + guidance_scale * (v_cond - v_uncond)
472
+ else:
473
+ v = model(
474
+ hidden_states=x,
475
+ encoder_hidden_states=t5_cond,
476
+ pooled_projections=clip_cond,
477
+ timestep=t_batch,
478
+ img_ids=img_ids,
479
+ guidance=guidance_embed,
480
+ )
481
+
482
  x = x + v * dt
483
 
484
  # Decode latents
 
540
  negative_prompt = gr.Text(
541
  label="Negative prompt",
542
  max_lines=1,
543
+ placeholder="blurry, distorted, low quality",
544
+ value="",
545
  )
546
  seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
547
  randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
 
551
  height = gr.Slider(label="Height", minimum=256, maximum=1024, step=64, value=512)
552
 
553
  with gr.Row():
554
+ guidance_scale = gr.Slider(label="CFG Scale", minimum=1.0, maximum=10.0, step=0.5, value=5.0)
555
+ num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=1, value=25)
556
 
557
  gr.Examples(examples=examples, inputs=[prompt])
558
 
559
  gr.Markdown("""
560
  ---
561
+ **Notes:** Trained at 512×512. CFG 3.0-7.0 recommended, 20-30 steps.
562
  """)
563
 
564
  gr.on(