Spaces: Running on Zero
Update app.py

app.py CHANGED
@@ -6,12 +6,9 @@ import random
 import spaces
 import torch
 from diffusers import Flux2Pipeline, Flux2Transformer2DModel
-from diffusers import BitsAndBytesConfig as DiffBitsAndBytesConfig
 import requests
 from PIL import Image
-import json
 import base64
-from huggingface_hub import InferenceClient
 
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"

@@ -19,45 +16,25 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
 
-hf_client = InferenceClient(
-    api_key=os.environ.get("HF_TOKEN"),
-)
-VLM_MODEL = "baidu/ERNIE-4.5-VL-424B-A47B-Base-PT"
-
-SYSTEM_PROMPT_TEXT_ONLY = """You are an expert prompt engineer for FLUX.2 by Black Forest Labs. Rewrite user prompts to be more descriptive while strictly preserving their core subject and intent.
-
-Guidelines:
-1. Structure: Keep structured inputs structured (enhance within fields). Convert natural language to detailed paragraphs.
-2. Details: Add concrete visual specifics - form, scale, textures, materials, lighting (quality, direction, color), shadows, spatial relationships, and environmental context.
-3. Text in Images: Put ALL text in quotation marks, matching the prompt's language. Always provide explicit quoted text for objects that would contain text in reality (signs, labels, screens, etc.) - without it, the model generates gibberish.
-
-Output only the revised prompt and nothing else."""
-
-SYSTEM_PROMPT_WITH_IMAGES = """You are FLUX.2 by Black Forest Labs, an image-editing expert. You convert editing requests into one concise instruction (50-80 words, ~30 for brief requests).
-
-Rules:
-- Single instruction only, no commentary
-- Use clear, analytical language (avoid "whimsical," "cascading," etc.)
-- Specify what changes AND what stays the same (face, lighting, composition)
-- Reference actual image elements
-- Turn negatives into positives ("don't change X" → "keep X")
-- Make abstractions concrete ("futuristic" → "glowing cyan neon, metallic panels")
-- Keep content PG-13
-
-Output only the final instruction in plain text and nothing else."""
-
-def remote_text_encoder(prompts):
+def remote_text_encoder(prompts, max_retries=3):
     from gradio_client import Client
+    import time
 
-    client = Client("multimodalart/mistral-text-encoder")
-    result = client.predict(
-        prompt=prompts,
-        api_name="/encode_text"
-    )
-    prompt_embeds = torch.load(result[0])
-    return prompt_embeds
+    for attempt in range(max_retries):
+        try:
+            client = Client("multimodalart/mistral-text-encoder")
+            result = client.predict(
+                prompt=prompts,
+                api_name="/encode_text"
+            )
+            prompt_embeds = torch.load(result[0])
+            return prompt_embeds
+        except Exception as e:
+            print(f"Text encoder attempt {attempt + 1}/{max_retries} failed: {e}")
+            if attempt < max_retries - 1:
+                time.sleep(2)
+            else:
+                raise Exception(f"Text encoder failed after {max_retries} attempts: {e}")
 
 # Load model
 repo_id = "black-forest-labs/FLUX.2-dev"
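Note: the rewritten remote_text_encoder delegates text encoding to the multimodalart/mistral-text-encoder Space through gradio_client and retries transient failures. A minimal usage sketch (the sample prompt is illustrative, and the call only succeeds while that Space is reachable):

    # Each failed attempt prints a warning and sleeps 2 s; after max_retries
    # failures the exception propagates to the caller.
    prompt_embeds = remote_text_encoder("a red fox curled up in fresh snow")
    # torch.load() returns the tensor as saved by the Space (typically on CPU);
    # generate_image() moves it to the GPU with .to(device) later.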
@@ -76,85 +53,34 @@ pipe = Flux2Pipeline.from_pretrained(
 )
 pipe.to(device)
 
 # AOTI blocks temporarily disabled - HuggingFace needs to recompile for new ZeroGPU environment
-# Re-enable once zerogpu-aoti/FLUX.2 is updated with compatible compiled blocks
 # spaces.aoti_blocks_load(pipe.transformer, "zerogpu-aoti/FLUX.2", variant="fa3")
 
-def image_to_data_uri(img):
-    buffered = io.BytesIO()
-    img.save(buffered, format="PNG")
-    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    return f"data:image/png;base64,{img_str}"
-
-def upsample_prompt_logic(prompt, image_list):
-    try:
-        if image_list and len(image_list) > 0:
-            # Image + Text Editing Mode
-            system_content = SYSTEM_PROMPT_WITH_IMAGES
-
-            # Construct user message with text and images
-            user_content = [{"type": "text", "text": prompt}]
-
-            for img in image_list:
-                data_uri = image_to_data_uri(img)
-                user_content.append({
-                    "type": "image_url",
-                    "image_url": {"url": data_uri}
-                })
-
-            messages = [
-                {"role": "system", "content": system_content},
-                {"role": "user", "content": user_content}
-            ]
-        else:
-            # Text Only Mode
-            system_content = SYSTEM_PROMPT_TEXT_ONLY
-            messages = [
-                {"role": "system", "content": system_content},
-                {"role": "user", "content": prompt}
-            ]
-
-        completion = hf_client.chat.completions.create(
-            model=VLM_MODEL,
-            messages=messages,
-            max_tokens=1024
-        )
-
-        return completion.choices[0].message.content
-    except Exception as e:
-        print(f"Upsampling failed: {e}")
-        return prompt
-
 def update_dimensions_from_image(image_list):
-    """Update width/height sliders based on uploaded image aspect ratio.
-    Keeps one side at 1024 and scales the other proportionally, with both sides as multiples of 8."""
+    """Update width/height sliders based on uploaded image aspect ratio."""
     if image_list is None or len(image_list) == 0:
         return 1024, 1024
 
-    img = image_list[0][0]  # Gallery returns list of tuples (image, caption)
+    img = image_list[0][0]
     img_width, img_height = img.size
 
     aspect_ratio = img_width / img_height
 
     if aspect_ratio >= 1:
         new_width = 1024
         new_height = int(1024 / aspect_ratio)
     else:
         new_height = 1024
         new_width = int(1024 * aspect_ratio)
 
-    # Round to nearest multiple of 8
     new_width = round(new_width / 8) * 8
     new_height = round(new_height / 8) * 8
 
-    # Ensure within valid range (minimum 256, maximum 1024)
     new_width = max(256, min(1024, new_width))
     new_height = max(256, min(1024, new_height))
 
     return new_width, new_height
 
-# Updated duration function to match generate_image arguments (including progress)
 def get_duration(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
     num_images = 0 if image_list is None else len(image_list)
     step_duration = 1 + 0.8 * num_images
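Note: update_dimensions_from_image pins the long side to 1024, scales the short side by the aspect ratio, rounds both sides to multiples of 8, and clamps to the slider range [256, 1024]. A self-contained sketch of the same arithmetic (the helper name and sample sizes are illustrative):

    def fit_to_1024(w, h):
        # Pin the long side to 1024 and scale the other proportionally.
        ar = w / h
        nw, nh = (1024, int(1024 / ar)) if ar >= 1 else (int(1024 * ar), 1024)
        # Round to multiples of 8, then clamp to [256, 1024].
        nw, nh = round(nw / 8) * 8, round(nh / 8) * 8
        return max(256, min(1024, nw)), max(256, min(1024, nh))

    print(fit_to_1024(1920, 1080))  # (1024, 576)
    print(fit_to_1024(1080, 1920))  # (576, 1024)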
@@ -162,7 +88,6 @@ def get_duration(prompt_embeds, image_list, width, height, num_inference_steps,
 
 @spaces.GPU(duration=get_duration)
 def generate_image(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
-    # Move embeddings to GPU only when inside the GPU decorated function
     prompt_embeds = prompt_embeds.to(device)
 
     generator = torch.Generator(device=device).manual_seed(seed)
@@ -177,39 +102,28 @@ def generate_image(prompt_embeds, image_list, width, height, num_inference_steps
         "height": height,
     }
 
-    # Progress bar for the actual generation steps
     if progress:
         progress(0, desc="Starting generation...")
 
     image = pipe(**pipe_kwargs).images[0]
     return image
 
-def infer(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, prompt_upsampling=True, progress=gr.Progress(track_tqdm=True)):
+def infer(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, progress=gr.Progress(track_tqdm=True)):
 
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
 
-    # Prepare image list (convert None or empty gallery to None)
     image_list = None
     if input_images is not None and len(input_images) > 0:
         image_list = []
         for item in input_images:
             image_list.append(item[0])
 
-    #
-    final_prompt = prompt
-    if prompt_upsampling:
-        progress(0.05, desc="Upsampling prompt...")
-        final_prompt = upsample_prompt_logic(prompt, image_list)
-        print(f"Original Prompt: {prompt}")
-        print(f"Upsampled Prompt: {final_prompt}")
-
-    # 2. Text Encoding (Network bound - No GPU needed)
+    # Text Encoding
     progress(0.1, desc="Encoding prompt...")
-
-    prompt_embeds = remote_text_encoder(final_prompt)
+    prompt_embeds = remote_text_encoder(prompt)
 
-    #
+    # Image Generation
     progress(0.3, desc="Waiting for GPU...")
     image = generate_image(
         prompt_embeds,
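Note: with prompt upsampling removed, infer now does exactly two steps: network-bound remote text encoding (no GPU slot held), then the @spaces.GPU-decorated generation call, whose ZeroGPU duration get_duration estimates from the number of reference images. A condensed sketch of that flow using names from the diff (not a verbatim excerpt):

    prompt_embeds = remote_text_encoder(prompt)    # network-bound, runs on CPU
    image = generate_image(                        # GPU slot opens here, sized by
        prompt_embeds, image_list, width, height,  # get_duration(): each extra
        num_inference_steps, guidance_scale, seed  # reference image adds 0.8 s
    )                                              # to the per-step budget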
@@ -232,7 +146,6 @@ examples = [
 ]
 
 examples_images = [
-    # ["Replace the top of the person from image 1 with the one from image 2", ["person1.webp", "woman2.webp"]],
     ["The person from image 1 is petting the cat from image 2, the bird from image 3 is next to them", ["woman1.webp", "cat_window.webp", "bird.webp"]]
 ]
 
@@ -275,12 +188,6 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
             )
 
             with gr.Accordion("Advanced Settings", open=False):
-                prompt_upsampling = gr.Checkbox(
-                    label="Prompt Upsampling",
-                    value=True,
-                    info="Automatically enhance the prompt using a VLM"
-                )
-
                 seed = gr.Slider(
                     label="Seed",
                     minimum=0,
@@ -292,7 +199,6 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
                 randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
 
                 with gr.Row():
-
                     width = gr.Slider(
                         label="Width",
                         minimum=256,
@@ -310,7 +216,6 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
                     )
 
                 with gr.Row():
-
                     num_inference_steps = gr.Slider(
                         label="Number of inference steps",
                         minimum=1,
@@ -327,10 +232,8 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
                         value=4,
                     )
 
-
         with gr.Column():
             result = gr.Image(label="Result", show_label=False)
-
 
     gr.Examples(
         examples=examples,
@@ -350,7 +253,6 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
         cache_mode="lazy"
     )
 
-    # Auto-update dimensions when images are uploaded
    input_images.upload(
        fn=update_dimensions_from_image,
        inputs=[input_images],
@@ -360,7 +262,7 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
     gr.on(
         triggers=[run_button.click, prompt.submit],
         fn=infer,
-        inputs=[prompt, input_images, seed, randomize_seed, width, height, num_inference_steps, guidance_scale, prompt_upsampling],
+        inputs=[prompt, input_images, seed, randomize_seed, width, height, num_inference_steps, guidance_scale],
         outputs=[result, seed]
     )
 