Spaces:

1inkusFace
/

StableDiffusion-3.5-Large-lora

Paused

App Files Files Community

1inkusFace committed on Oct 13, 2025

Commit

a0eb807

verified ·

1 Parent(s): 7ea2c56

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -76

app.py CHANGED Viewed

@@ -47,46 +47,6 @@ from image_gen_aux import UpscaleWithModel
 from diffusers.models.attention_processor import AttnProcessor2_0
 from diffusers.models.attention_processor import Attention
-from kernels import get_kernel
-vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
-class FlashAttentionProcessor(AttnProcessor2_0):
-    """
-    A custom attention processor that uses a pre-compiled Flash Attention 3 kernel.
-    It inherits from AttnProcessor2_0, which is compatible with PyTorch 2.x attention.
-    """
-    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, **kwargs):
-        # The 'attn' argument is the parent Attention module, giving access to its parameters.
-        # The implementation from the kernels library expects query, key, and value in a
-        # specific format (Batch, Sequence, Heads, Dim_Head), so we must reshape accordingly.
-        query = attn.to_q(hidden_states)
-        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-        scale = attn.scale
-        query = query * scale
-        b, t, c = query.shape
-        h = attn.heads
-        d = c // h
-        # Reshape to (Batch, Heads, Sequence, Dim_Head) for the Flash Attention kernel
-        q_reshaped = query.reshape(b, t, h, d).permute(0, 2, 1, 3)
-        k_reshaped = key.reshape(b, t, h, d).permute(0, 2, 1, 3)
-        v_reshaped = value.reshape(b, t, h, d).permute(0, 2, 1, 3)
-        out_reshaped = torch.empty_like(q_reshaped)
-        # Call the pre-compiled kernel
-        vllm_flash_attn3.attention(q_reshaped, k_reshaped, v_reshaped, out_reshaped)
-        # Reshape output back to (Batch, Sequence, Heads * Dim_Head)
-        out = out_reshaped.permute(0, 2, 1, 3).reshape(b, t, c)
-        out = attn.to_out(out)
-        return out
 # --- GCS Configuration ---
@@ -123,49 +83,15 @@ def upload_to_gcs(image_object, filename):
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-import torch.export
 @spaces.GPU(duration=120)
 def compile_transformer():
     with spaces.aoti_capture(pipe.transformer) as call:
-        # This run captures the structure of the inputs in call.args and call.kwargs
-        pipe(
-            "A majestic, ancient Egyptian Sphinx stands sentinel in a large, clear pool under a bright, golden desert sun. Around its weathered stone base, several sleek, playful dolphins gracefully navigate the turquoise waters. The surrounding environment features lush, exotic papyrus plants and distant pyramids under a cloudless sky, conveying a sense of timeless wonder and serene majesty."
-        )
-    # --- START OF CHANGE ---
-    dynamic_shapes = {
-        # Give the two different sequence lengths unique names
-        "hidden_states": {
-            0: torch.export.Dim("batch_size"),
-            1: torch.export.Dim("image_sequence_length"),  # <-- Unique name
-        },
-        "encoder_hidden_states": {
-            0: torch.export.Dim("batch_size"),
-            1: torch.export.Dim("text_sequence_length"),   # <-- Unique name
-        },
-        # The rest remains the same
-        "pooled_projections": {
-            0: torch.export.Dim("batch_size"),
-        },
-        "timestep": {
-            0: torch.export.Dim("batch_size"),
-        },
-        "joint_attention_kwargs": None,
-        "return_dict": None,
-    }
-    # --- END OF CHANGE ---
     exported = torch.export.export(
         pipe.transformer,
         args=call.args,
         kwargs=call.kwargs,
-        dynamic_shapes=dynamic_shapes,
     )
     return spaces.aoti_compile(exported)
 def load_model():
@@ -185,7 +111,7 @@ def load_model():
     upscaler_2 = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(device)
     return pipe, upscaler_2
-#fa_processor = FlashAttentionProcessor()
 pipe, upscaler_2 = load_model()

 from diffusers.models.attention_processor import AttnProcessor2_0
 from diffusers.models.attention_processor import Attention
 # --- GCS Configuration ---
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 @spaces.GPU(duration=120)
 def compile_transformer():
     with spaces.aoti_capture(pipe.transformer) as call:
+        pipe("A majestic, ancient Egyptian Sphinx stands sentinel in a large, clear pool under a bright, golden desert sun. Around its weathered stone base, several sleek, playful dolphins gracefully navigate the turquoise waters. The surrounding environment features lush, exotic papyrus plants and distant pyramids under a cloudless sky, conveying a sense of timeless wonder and serene majesty.")
     exported = torch.export.export(
         pipe.transformer,
         args=call.args,
         kwargs=call.kwargs,
     )
     return spaces.aoti_compile(exported)
 def load_model():
     upscaler_2 = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(device)
     return pipe, upscaler_2
+fa_processor = FlashAttentionProcessor()
 pipe, upscaler_2 = load_model()