Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -14,65 +14,11 @@ from live_preview_helpers import calculate_shift, retrieve_timesteps, flux_pipe_
 from io import BytesIO
 import base64
 from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
-###
-# Step 2: Modified pipeline class with proper component registration
-class T5FluxPipeline(DiffusionPipeline):
-    def __init__(self, text_encoder, tokenizer, vae, unet, scheduler):
-        super().__init__()
-        self.device = device
-        self.dtype = dtype
-        self.register_modules(
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            vae=vae,
-            unet=unet,
-            scheduler=scheduler
-        )
-        self.text_projection = torch.nn.Linear(768, 4096).to(device=device, dtype=dtype)
-        torch.nn.init.normal_(self.text_projection.weight, std=0.02)
-        torch.nn.init.zeros_(self.text_projection.bias)
-
-    def encode_prompt(self, prompt, device, num_images_per_prompt=1,
-                      do_classifier_free_guidance=False, negative_prompt=None):
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=512,
-            truncation=True,
-            return_tensors="pt",
-        ).to(device)
-
-        text_embeddings = self.text_encoder(**text_inputs).last_hidden_state
-        text_embeddings = self.text_projection(text_embeddings)
-        pooled_embeddings = text_embeddings.mean(dim=1)
-
-        if do_classifier_free_guidance:
-            uncond_input = self.tokenizer(
-                [negative_prompt] if negative_prompt else [""],
-                padding="max_length",
-                max_length=512,
-                truncation=True,
-                return_tensors="pt",
-            ).to(device)
-
-            uncond_embeddings = self.text_projection(
-                self.text_encoder(**uncond_input).last_hidden_state
-            )
-            uncond_pooled = uncond_embeddings.mean(dim=1)
-
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-            pooled_embeddings = torch.cat([uncond_pooled, pooled_embeddings])
-
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-        pooled_embeddings = pooled_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-
-        return text_embeddings, pooled_embeddings, text_inputs.input_ids
-
-###
-
 os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '120'
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
 def get_hf_token(encrypted_token):
     # Retrieve the decryption key from an environment variable
     key = "K4FlQbffvTcDxT2FIhrOPV1eue6ia45FFR3kqp2hHbM="
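
For reference, the deleted encode_prompt reduces to this standalone sketch: T5-base hidden states (768-dim) are pushed through the learned Linear(768, 4096) and mean-pooled into one vector per prompt. The prompt string is illustrative; the tokenizer settings and shapes mirror the removed code.

    import torch
    from transformers import T5EncoderModel, T5TokenizerFast

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-base", model_max_length=512)
    encoder = T5EncoderModel.from_pretrained("google-t5/t5-base").to(device)
    projection = torch.nn.Linear(768, 4096).to(device)  # 768 = T5-base hidden size

    inputs = tokenizer(["an illustrative prompt"], padding="max_length",
                       max_length=512, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        hidden = encoder(**inputs).last_hidden_state  # (1, 512, 768)
        embeddings = projection(hidden)               # (1, 512, 4096)
        pooled = embeddings.mean(dim=1)               # (1, 4096)
    print(embeddings.shape, pooled.shape)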
@@ -95,38 +41,72 @@ decrypted_token = get_hf_token("gAAAAABn3GfShExoJd50nau3B5ZJNiQ9dRD1ACO3XXMwVaIQ
 login(token=decrypted_token)
 groq_client = Groq(api_key="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")
 
-
-
-
+
+# Load T5 components for longer context
+t5_tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-base", model_max_length=512)
+t5_text_encoder = T5EncoderModel.from_pretrained("google-t5/t5-base").to(device, dtype=dtype)
+
+# Add projection layer to match CLIP's embedding dimensions
+class TextProjection(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.proj = torch.nn.Linear(768, 768)  # T5-base to CLIP dimensions
+        torch.nn.init.normal_(self.proj.weight, std=0.02)
+
+    def forward(self, x):
+        return self.proj(x.to(dtype))
+
+# Initialize pipeline components
 taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).to(device)
 good_vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=dtype).to(device)
-#
-
-
-
-
-
-
-
-    tokenizer=t5_tokenizer,
-    torch_dtype=dtype,
-    safety_checker=None,
-    requires_safety_checker=False
-).to(device)
-
-pipe.text_projection = pipe.text_projection.to(device, dtype=dtype)
-torch.cuda.empty_cache()
+# Custom pipeline with T5 support
+pipe = DiffusionPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    text_encoder=t5_text_encoder,
+    tokenizer=t5_tokenizer,
+    torch_dtype=dtype,
+    vae=taef1,
+    safety_checker=None
+).to(device)
 
-
-
-
+# Add projection layer to pipeline
+pipe.text_projection = TextProjection().to(device, dtype=dtype)
+torch.cuda.empty_cache()
 
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
 
-
+# Monkey-patch the text encoding method
+def custom_encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None):
+    text_inputs = self.tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=512,
+        truncation=True,
+        return_tensors="pt",
+    ).to(device)
+
+    text_embeddings = self.text_encoder(**text_inputs).last_hidden_state
+    text_embeddings = self.text_projection(text_embeddings)
+
+    if do_classifier_free_guidance:
+        uncond_input = self.tokenizer(
+            [negative_prompt] if negative_prompt else [""],
+            padding="max_length",
+            max_length=512,
+            truncation=True,
+            return_tensors="pt",
+        ).to(device)
+
+        uncond_embeddings = self.text_projection(
+            self.text_encoder(**uncond_input).last_hidden_state
+        )
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    return text_embeddings
+
+pipe._encode_prompt = custom_encode_prompt.__get__(pipe)
+pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(pipe)
 
 # History functions
 def append_to_history(image, prompt, seed, width, height, guidance_scale, steps, history):
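
The two __get__ calls at the end of this hunk rely on the fact that plain Python functions are descriptors: function.__get__(obj) returns a method bound to obj, so self inside custom_encode_prompt becomes the pipeline instance. A minimal self-contained illustration (the class and function names here are invented for the demo):

    import types

    class Pipe:
        name = "demo-pipe"

    def describe(self, suffix):
        return f"{self.name}:{suffix}"

    pipe = Pipe()

    # Functions are descriptors; __get__ binds one to an instance.
    pipe.describe = describe.__get__(pipe)
    print(pipe.describe("bound"))             # demo-pipe:bound

    # types.MethodType(describe, pipe) is the explicit equivalent.
    pipe.describe2 = types.MethodType(describe, pipe)
    print(pipe.describe2("bound"))            # demo-pipe:bound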
@@ -182,28 +162,27 @@ def create_history_html(history):
 
 
 @spaces.GPU(duration=75)
-def infer(prompt, seed, randomize_seed, width, height,
+def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024,
+          guidance_scale=3.5, num_inference_steps=28, progress=gr.Progress(track_tqdm=True)):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
 
-    #
-
-
-        prompt=prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-        output_type="pil",
-        good_vae=good_vae,
-    ):
-        final_image = img  # Keep updating until we get the final image
-        yield img, seed  # Live preview
+    # Truncate prompt to 512 tokens if needed
+    tokens = t5_tokenizer.encode(prompt)[:512]
+    processed_prompt = t5_tokenizer.decode(tokens, skip_special_tokens=True)
+
+    for img in pipe.flux_pipe_call_that_returns_an_iterable_of_images(
+        prompt=processed_prompt,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        width=width,
+        height=height,
+        generator=generator,
+        output_type="pil",
+        good_vae=good_vae,
+    ):
+        yield img, seed
 
-
-
 
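
Because infer is now a generator yielding (image, seed) pairs, a caller drains it and keeps the last yield as the final image. A hypothetical driver outside Gradio, assuming the Space's dependencies are installed; the prompt and sizes below are made up:

    # Hypothetical local driver for the generator-style infer() above.
    final_image, used_seed = None, None
    for img, seed in infer("a watercolor fox", randomize_seed=True,
                           width=768, height=768, num_inference_steps=28):
        final_image, used_seed = img, seed  # every yield is a live preview
    final_image.save("result.png")
    print("seed used:", used_seed)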
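
One detail worth checking in this commit: the deleted class projected T5-base's 768-dim states up to 4096, while the new TextProjection maps 768 to 768. The FLUX transformer records the text-embedding width it expects in its config, so a quick shape check catches a mismatch before the denoising loop. A hedged sketch, assuming diffusers' FluxTransformer2DModel exposes joint_attention_dim (4096 for FLUX.1-dev):

    # Sanity check (not part of the commit): compare the projection's output
    # width with the text-embedding width the FLUX transformer expects.
    expected = pipe.transformer.config.joint_attention_dim  # 4096 on FLUX.1-dev
    sample = torch.zeros(1, 512, 768, device=device, dtype=dtype)
    projected = pipe.text_projection(sample)
    assert projected.shape[-1] == expected, (
        f"text_projection outputs {projected.shape[-1]} dims; "
        f"transformer expects {expected}")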