Qwen-Image-ControlNet-Inpainting

Running on Zero

App Files Files Community

linoyts HF Staff committed on Sep 9, 2025

Commit

09a6cb1

verified ·

1 Parent(s): d23b320

Update app.py

Browse files

Files changed (1) hide show

app.py +294 -77

app.py CHANGED Viewed

@@ -1,87 +1,304 @@
-with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
-    gr.HTML("<h1 style='text-align: center'>Qwen-Image with InstantX Inpainting ControlNet</style>")
-    gr.Markdown(
-        "Generate images with the [InstantX/Qwen-Image-ControlNet-Inpainting](https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting) that takes depth, pose and canny conditionings"
-    )
-    with gr.Row():
-        with gr.Column():
-            edit_image = gr.ImageEditor(
-                label='Upload and draw mask for inpainting',
-                type='pil',
-                sources=["upload", "webcam"],
-                image_mode='RGB',
-                layers=False,
-                brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"),
-                height=600
-            )
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt (e.g., 'change the hat to red')",
-                container=False,
-            )
-            negative_prompt = gr.Text(
-                label="Negative Prompt",
-                show_label=True,
-                max_lines=1,
-                placeholder="Enter what you don't want (optional)",
-                container=False,
-                value="",
-                visible=False
-            )
-            run_button = gr.Button("Run")
-        with gr.Column():
-            result = gr.ImageSlider(label="Result", show_label=False, interactive=False)
-            use_as_input_button = gr.Button("🔄 Use as Input Image", visible=False, variant="secondary")
-    with gr.Accordion("Advanced Settings", open=False):
-        seed = gr.Slider(
-            label="Seed",
-            minimum=0,
-            maximum=MAX_SEED,
-            step=1,
-            value=42,
         )
-        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
         with gr.Row():
-            strength = gr.Slider(
-                label="Conditioning Scale",
-                minimum=0.0,
-                maximum=1.0,
-                step=0.1,
-                value=1.0,
-                info="Controls how much the inpainted region should change"
-            )
-            true_cfg_scale = gr.Slider(
-                label="True CFG Scale",
-                minimum=1.0,
-                maximum=10.0,
-                step=0.5,
-                value=4.0,
-                info="Classifier-free guidance scale"
-            )
-            num_inference_steps = gr.Slider(
-                label="Number of inference steps",
-                minimum=1,
-                maximum=50,
                 step=1,
-                value=30,
             )
-        rewrite_prompt = gr.Checkbox(
-            label="Enhance prompt (using HF Inference)",
-            value=True
-        )
-    # Event handlers for reuse functionality (MUST be inside gr.Blocks context with 4 spaces)
     use_as_input_button.click(
         fn=use_output_as_input,
         inputs=[result],
@@ -97,9 +314,9 @@ with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
         outputs=result,
         show_api=False
     ).then(
-        fn=infer,
-        inputs=[edit_image, prompt, negative_prompt, seed, randomize_seed, strength, num_inference_steps, true_cfg_scale, rewrite_prompt],
-        outputs=[result, seed]
     ).then(
         fn=lambda: gr.update(visible=True),
         inputs=None,

+import gradio as gr
+import numpy as np
+import spaces
+import torch
+import random
+import os
+# from diffusers import QwenImageEditInpaintPipeline
+from optimization import optimize_pipeline_
+from diffusers.utils import load_image
+from diffusers import QwenImageControlNetModel, QwenImageControlNetInpaintPipeline
+import math
+from huggingface_hub import InferenceClient
+from PIL import Image
+# Set environment variable for parallel loading
+# os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"
+# --- Prompt Enhancement using Hugging Face InferenceClient ---
+def polish_prompt_hf(original_prompt, system_prompt):
+    """
+    Rewrite a prompt with a chat model through the Hugging Face InferenceClient.
+
+    Args:
+        original_prompt: Text sent as the user message.
+        system_prompt: Text sent as the system message.
+
+    Returns:
+        The model's reply, stripped and with newlines replaced by spaces. When
+        the reply contains a ``{"Rewritten": ...}`` JSON object that parses,
+        the value stored under ``"Rewritten"`` is used instead of the raw
+        reply. ``original_prompt`` is returned unchanged when HF_TOKEN is not
+        set or when the API call raises.
+    """
+    # Imported locally because json is not among the visible module-level
+    # imports; without it the JSON branch below could never succeed.
+    import json
+
+    # Ensure HF_TOKEN is set
+    api_key = os.environ.get("HF_TOKEN")
+    if not api_key:
+        print("Warning: HF_TOKEN not set. Falling back to original prompt.")
+        return original_prompt
+    try:
+        # Initialize the client
+        client = InferenceClient(
+            provider="cerebras",
+            api_key=api_key,
+        )
+        # Format the messages for the chat completions API
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": original_prompt}
+        ]
+        # Call the API
+        completion = client.chat.completions.create(
+            model="Qwen/Qwen3-235B-A22B-Instruct-2507",
+            messages=messages,
         )
+        # Parse the response
+        result = completion.choices[0].message.content
+        # Try to extract JSON if present
+        if '{"Rewritten"' in result:
+            # Strip Markdown code fences so the payload can be parsed
+            result = result.replace('```json', '').replace('```', '')
+            try:
+                result_json = json.loads(result)
+            except json.JSONDecodeError:
+                # Not valid JSON after all: keep the fence-stripped text
+                result_json = None
+            if isinstance(result_json, dict):
+                polished_prompt = result_json.get('Rewritten', result)
+            else:
+                polished_prompt = result
+        else:
+            polished_prompt = result
+        # str() guards against a non-string "Rewritten" value
+        polished_prompt = str(polished_prompt).strip().replace("\n", " ")
+        return polished_prompt
+    except Exception as e:
+        print(f"Error during API call to Hugging Face: {e}")
+        # Fallback to original prompt if enhancement fails
+        return original_prompt
+def polish_prompt(prompt, img):
+    """
+    Polish an image-edit instruction using HF inference.
+
+    The instruction is wrapped together with a fixed rewriting system prompt
+    and sent to polish_prompt_hf.
+
+    Args:
+        prompt: The user's edit instruction.
+        img: Not used by this function; kept so the interface stays
+            consistent.
+
+    Returns:
+        The rewritten instruction from polish_prompt_hf, or ``prompt``
+        unchanged when polish_prompt_hf could not produce a rewrite and
+        handed its input back.
+    """
+    SYSTEM_PROMPT = '''
+# Edit Instruction Rewriter
+You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
+Please strictly follow the rewriting rules below:
+## 1. General Principles
+- Keep the rewritten prompt **concise**. Avoid overly long sentences and reduce unnecessary descriptive language.
+- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
+- Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
+- All added objects or modifications must align with the logic and style of the edited input image's overall scene.
+## 2. Task Type Handling Rules
+### 1. Add, Delete, Replace Tasks
+- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
+- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
+    > Original: "Add an animal"
+    > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
+- Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
+- For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
+### 2. Text Editing Tasks
+- All text content must be enclosed in English double quotes " ". Do not translate or alter the original language of the text, and do not change the capitalization.
+- **For text replacement tasks, always use the fixed template:**
+    - Replace "xx" to "yy".
+    - Replace the xx bounding box to "yy".
+- If the user does not specify text content, infer and add concise text based on the instruction and the input image's context. For example:
+    > Original: "Add a line of text" (poster)
+    > Rewritten: "Add text "LIMITED EDITION" at the top center with slight shadow"
+- Specify text position, color, and layout in a concise way.
+### 3. Human Editing Tasks
+- Maintain the person's core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
+- If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
+- **For expression changes, they must be natural and subtle, never exaggerated.**
+- If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
+    - For background change tasks, emphasize maintaining subject consistency at first.
+- Example:
+    > Original: "Change the person's hat"
+    > Rewritten: "Replace the man's hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"
+### 4. Style Transformation or Enhancement Tasks
+- If a style is specified, describe it concisely with key visual traits. For example:
+    > Original: "Disco style"
+    > Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
+- If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
+- **For coloring tasks, including restoring old photos, always use the fixed template:** "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
+- If there are other changes, place the style description at the end.
+## 3. Rationality and Logic Checks
+- Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
+- Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).
+# Output Format
+Return only the rewritten instruction text directly, without JSON formatting or any other wrapper.
+'''
+    # Note: We're not actually using the image in the HF version,
+    # but keeping the interface consistent
+    # NOTE(review): SYSTEM_PROMPT is embedded in the user message below and is
+    # also passed as the system message, so the model receives it twice.
+    # Left as-is to keep the request unchanged - confirm this is intended.
+    full_prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {prompt}\n\nRewritten Prompt:"
+    rewritten = polish_prompt_hf(full_prompt, SYSTEM_PROMPT)
+    # polish_prompt_hf returns its input untouched when HF_TOKEN is missing or
+    # the API call fails. In that case hand back the user's own instruction
+    # rather than the instruction wrapped in the entire system prompt.
+    if rewritten == full_prompt:
+        return prompt
+    return rewritten
+# Largest int32 value; used as the upper bound for the seed slider and for
+# randomly drawn seeds.
+MAX_SEED = np.iinfo(np.int32).max
+# NOTE(review): not referenced in the visible part of this file - confirm it
+# is still needed.
+MAX_IMAGE_SIZE = 2048
+# --- Helper functions for reuse feature ---
+def clear_result():
+    """Build a component update that resets the value to None."""
+    cleared = gr.update(value=None)
+    return cleared
+def use_output_as_input(output_image):
+    """Return an update carrying the second item of the output pair.
+
+    When there is no output (``None``), an empty update is returned so the
+    target component is left as it is.
+    """
+    if output_image is None:
+        return gr.update()
+    generated = output_image[1]
+    return gr.update(value=generated)
+# Repository ids of the base model and of the inpainting ControlNet.
+base_model = "Qwen/Qwen-Image"
+controlnet_model = "InstantX/Qwen-Image-ControlNet-Inpainting"
+# Load the ControlNet weights in bfloat16.
+controlnet = QwenImageControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
+# Build the inpainting pipeline around the base model with the ControlNet attached.
+pipe = QwenImageControlNetInpaintPipeline.from_pretrained(
+    base_model, controlnet=controlnet, torch_dtype=torch.bfloat16
+)
+# Runs at import time: the pipeline is moved to CUDA before any request is served.
+pipe.to("cuda")
+@spaces.GPU(duration=120)
+def infer(edit_images,
+          prompt,
+          negative_prompt=" ",
+          seed=42,
+          randomize_seed=False,
+          strength=1.0,
+          num_inference_steps=30,
+          true_cfg_scale=4.0,
+          rewrite_prompt=True,
+          progress=gr.Progress(track_tqdm=True)):
+    """
+    Inpaint the masked region of an image with the ControlNet pipeline.
+
+    Args:
+        edit_images: Editor value; its "background" entry is used as the
+            control image and the first entry of "layers" as the mask.
+        prompt: Edit instruction passed to the pipeline.
+        negative_prompt: Negative prompt passed to the pipeline.
+        seed: Seed for the torch generator.
+        randomize_seed: When true, ``seed`` is replaced by a random value in
+            [0, MAX_SEED].
+        strength: Passed to the pipeline as ``controlnet_conditioning_scale``.
+        num_inference_steps: Number of inference steps for the pipeline.
+        true_cfg_scale: Passed to the pipeline as ``true_cfg_scale``.
+        rewrite_prompt: When true, ``prompt`` is first rewritten with
+            polish_prompt.
+        progress: Gradio progress tracker; not referenced in the body.
+
+    Returns:
+        A tuple ``([input_image, result_image], seed)`` with the seed that
+        was actually used.
+
+    Raises:
+        gr.Error: If no image was provided or no mask layer is present.
+    """
+    image = edit_images["background"] if edit_images else None
+    if image is None:
+        raise gr.Error("Please upload an image before running.")
+    layers = edit_images.get("layers")
+    if not layers:
+        raise gr.Error("Please draw a mask over the area to inpaint.")
+    mask = layers[0]
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    if rewrite_prompt:
+        prompt = polish_prompt(prompt, image)
+        print(f"Rewritten Prompt: {prompt}")
+    generator = torch.Generator(device="cuda").manual_seed(seed)
+    # Generate image using Qwen pipeline
+    result_image = pipe(
+        prompt=prompt,
+        negative_prompt=negative_prompt,
+        control_image=image,
+        # Was `mask_image`, a name that is never defined in this function.
+        control_mask=mask,
+        controlnet_conditioning_scale=strength,
+        num_inference_steps=num_inference_steps,
+        true_cfg_scale=true_cfg_scale,
+        generator=generator
+    ).images[0]
+    return [image, result_image], seed
+# Sample edit instructions.
+# NOTE(review): no use of `examples` is visible in this part of the file -
+# confirm where it is consumed.
+examples = [
+    "change the hat to red",
+    "make the background a beautiful sunset",
+    "replace the object with a flower vase",
+]
+# Custom stylesheet handed to gr.Blocks(css=css) below.
+css = """
+#col-container {
+    margin: 0 auto;
+    max-width: 1024px;
+}
+#logo-title {
+    text-align: center;
+}
+#logo-title img {
+    width: 400px;
+}
+#edit_text{margin-top: -62px !important}
+"""
+with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
+        gr.HTML("<h1 style='text-align: center'>Qwen-Image with InstantX Inpainting ControlNet</style>")
+        gr.Markdown(
+            "Generate images with the [InstantX/Qwen-Image-ControlNet-Inpainting](https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting) that takes depth, pose and canny conditionings"
+        )
         with gr.Row():
+            with gr.Column():
+                edit_image = gr.ImageEditor(
+                    label='Upload and draw mask for inpainting',
+                    type='pil',
+                    sources=["upload", "webcam"],
+                    image_mode='RGB',
+                    layers=False,
+                    brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"),
+                    height=600
+                )
+                prompt = gr.Text(
+                    label="Prompt",
+                    show_label=False,
+                    max_lines=1,
+                    placeholder="Enter your prompt (e.g., 'change the hat to red')",
+                    container=False,
+                )
+                negative_prompt = gr.Text(
+                    label="Negative Prompt",
+                    show_label=True,
+                    max_lines=1,
+                    placeholder="Enter what you don't want (optional)",
+                    container=False,
+                    value="",
+                    visible=False
+                )
+                run_button = gr.Button("Run")
+            with gr.Column():
+                result = gr.ImageSlider(label="Result", show_label=False, interactive=False)
+                use_as_input_button = gr.Button("🔄 Use as Input Image", visible=False, variant="secondary")
+        with gr.Accordion("Advanced Settings", open=False):
+            seed = gr.Slider(
+                label="Seed",
+                minimum=0,
+                maximum=MAX_SEED,
                 step=1,
+                value=42,
             )
+            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+            with gr.Row():
+                strength = gr.Slider(
+                    label="Conditioning Scale",
+                    minimum=0.0,
+                    maximum=1.0,
+                    step=0.1,
+                    value=1.0,
+                    info="Controls how much the inpainted region should change"
+                )
+                true_cfg_scale = gr.Slider(
+                    label="True CFG Scale",
+                    minimum=1.0,
+                    maximum=10.0,
+                    step=0.5,
+                    value=4.0,
+                    info="Classifier-free guidance scale"
+                )
+                num_inference_steps = gr.Slider(
+                    label="Number of inference steps",
+                    minimum=1,
+                    maximum=50,
+                    step=1,
+                    value=30,
+                )
+            rewrite_prompt = gr.Checkbox(
+                label="Enhance prompt (using HF Inference)",
+                value=True
+            )
+    # Event handlers for reuse functionality
     use_as_input_button.click(
         fn=use_output_as_input,
         inputs=[result],
         outputs=result,
         show_api=False
     ).then(
+        fn = infer,
+        inputs = [edit_image, prompt, negative_prompt, seed, randomize_seed, strength, num_inference_steps, true_cfg_scale, rewrite_prompt],
+        outputs = [result, seed]
     ).then(
         fn=lambda: gr.update(visible=True),
         inputs=None,