qedit

Running on Zero

App Files Files Community

lenML committed 17 days ago

Commit

6da8d6f

verified ·

1 Parent(s): 2a5c8e8

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -225

app.py CHANGED Viewed

@@ -11,203 +11,8 @@ from diffusers import FlowMatchEulerDiscreteScheduler, QwenImageEditPlusPipeline
 # from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
 # from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
-from huggingface_hub import InferenceClient
 import math
-import os
-import base64
-from io import BytesIO
-import json
-SYSTEM_PROMPT = '''
-# Edit Instruction Rewriter
-You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
-Please strictly follow the rewriting rules below:
-## 1. General Principles
-- Keep the rewritten prompt **concise and comprehensive**. Avoid overly long sentences and unnecessary descriptive language.
-- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
-- Keep the main part of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
-- All added objects or modifications must align with the logic and style of the scene in the input images.
-- If multiple sub-images are to be generated, describe the content of each sub-image individually.
-## 2. Task-Type Handling Rules
-### 1. Add, Delete, Replace Tasks
-- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
-- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
-    > Original: "Add an animal"
-    > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
-- Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
-- For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
-### 2. Text Editing Tasks
-- All text content must be enclosed in English double quotes `" "`. Keep the original language of the text, and keep the capitalization.
-- Both adding new text and replacing existing text are text replacement tasks, For example:
-    - Replace "xx" to "yy"
-    - Replace the mask / bounding box to "yy"
-    - Replace the visual object to "yy"
-- Specify text position, color, and layout only if user has required.
-- If font is specified, keep the original language of the font.
-### 3. Human Editing Tasks
-- Make the smallest changes to the given user's prompt.
-- If changes to background, action, expression, camera shot, or ambient lighting are required, please list each modification individually.
-- **Edits to makeup or facial features / expression must be subtle, not exaggerated, and must preserve the subject's identity consistency.**
-    > Original: "Add eyebrows to the face"
-    > Rewritten: "Slightly thicken the person's eyebrows with little change, look natural."
-### 4. Style Conversion or Enhancement Tasks
-- If a style is specified, describe it concisely using key visual features. For example:
-    > Original: "Disco style"
-    > Rewritten: "1970s disco style: flashing lights, disco ball, mirrored walls, vibrant colors"
-- For style reference, analyze the original image and extract key characteristics (color, composition, texture, lighting, artistic style, etc.), integrating them into the instruction.
-- **Colorization tasks (including old photo restoration) must use the fixed template:**
-  "Restore and colorize the old photo."
-- Clearly specify the object to be modified. For example:
-    > Original: Modify the subject in Picture 1 to match the style of Picture 2.
-    > Rewritten: Change the girl in Picture 1 to the ink-wash style of Picture 2 — rendered in black-and-white watercolor with soft color transitions.
-### 5. Material Replacement
-- Clearly specify the object and the material. For example: "Change the material of the apple to papercut style."
-- For text material replacement, use the fixed template:
-    "Change the material of text "xxxx" to laser style"
-### 6. Logo/Pattern Editing
-- Material replacement should preserve the original shape and structure as much as possible. For example:
-   > Original: "Convert to sapphire material"
-   > Rewritten: "Convert the main subject in the image to sapphire material, preserving similar shape and structure"
-- When migrating logos/patterns to new scenes, ensure shape and structure consistency. For example:
-   > Original: "Migrate the logo in the image to a new scene"
-   > Rewritten: "Migrate the logo in the image to a new scene, preserving similar shape and structure"
-### 7. Multi-Image Tasks
-- Rewritten prompts must clearly point out which image's element is being modified. For example:
-    > Original: "Replace the subject of picture 1 with the subject of picture 2"
-    > Rewritten: "Replace the girl of picture 1 with the boy of picture 2, keeping picture 2's background unchanged"
-- For stylization tasks, describe the reference image's style in the rewritten prompt, while preserving the visual content of the source image.
-## 3. Rationale and Logic Check
-- Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" requires logical correction.
-- Supplement missing critical information: e.g., if position is unspecified, choose a reasonable area based on composition (near subject, blank space, center/edge, etc.).
-# Output Format Example
-```json
-{
-   "Rewritten": "..."
-}
-'''
-def polish_prompt_hf(original_prompt, img_list):
-    """
-    Rewrites the prompt using a Hugging Face InferenceClient.
-    Supports multiple images via img_list.
-    """
-    # The API key is read from the "inference_providers" environment variable (not HF_TOKEN, despite the warning text below)
-    api_key = os.environ.get("inference_providers")
-    if not api_key:
-        print("Warning: HF_TOKEN not set. Falling back to original prompt.")
-        return original_prompt
-    prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {original_prompt}\n\nRewritten Prompt:"  # the rewriting rules travel inside the user message text
-    system_prompt = "you are a helpful assistant, you should provide useful answers to users."  # generic system-role message sent alongside it
-    try:
-        # Create the inference client for the "nebius" provider
-        client = InferenceClient(
-            provider="nebius",
-            api_key=api_key,
-        )
-        # Convert the images to base64 data URLs
-        image_urls = []
-        if img_list is not None:
-            # Wrap a single non-list value so the loop below always iterates a list
-            if not isinstance(img_list, list):
-                img_list = [img_list]
-            for img in img_list:
-                image_url = None
-                # Objects exposing .save() are serialized to PNG in memory
-                if hasattr(img, 'save'):  # duck-typed check: anything with .save() is handled as a PIL Image
-                    buffered = BytesIO()
-                    img.save(buffered, format="PNG")
-                    img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
-                    image_url = f"data:image/png;base64,{img_base64}"
-                # A str is treated as a file path and its raw bytes are encoded as-is
-                elif isinstance(img, str):
-                    with open(img, "rb") as image_file:
-                        img_base64 = base64.b64encode(image_file.read()).decode('utf-8')
-                    image_url = f"data:image/png;base64,{img_base64}"  # NOTE(review): labelled image/png whatever the file's real format is
-                else:
-                    print(f"Warning: Unexpected image type: {type(img)}, skipping...")
-                    continue
-                if image_url:
-                    image_urls.append(image_url)
-        # Build the content array with the text part first, then one entry per image
-        content = [
-            {
-                "type": "text",
-                "text": prompt
-            }
-        ]
-        # Append every encoded image as an "image_url" content part
-        for image_url in image_urls:
-            content.append({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            })
-        # Format the messages for the chat completions API
-        messages = [
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": content
-            }
-        ]
-        # Call the API
-        completion = client.chat.completions.create(
-            model="Qwen/Qwen2.5-VL-72B-Instruct",
-            messages=messages,
-        )
-        # Take the text of the first returned choice
-        result = completion.choices[0].message.content
-        # Only attempt JSON parsing when the reply contains a "Rewritten" key
-        if '"Rewritten"' in result:
-            try:
-                # Strip Markdown code fences before parsing
-                result = result.replace('```json', '').replace('```', '')
-                result_json = json.loads(result)
-                polished_prompt = result_json.get('Rewritten', result)
-            except:  # NOTE(review): bare except; any failure here falls back to the fence-stripped reply text
-                polished_prompt = result
-        else:
-            polished_prompt = result
-        polished_prompt = polished_prompt.strip().replace("\n", " ")  # trim and flatten to a single line
-        return polished_prompt
-    except Exception as e:
-        print(f"Error during API call to Hugging Face: {e}")
-        # Fallback to original prompt if enhancement fails
-        return original_prompt
-def encode_image(pil_image):  # returns the image, saved as PNG, as a base64 text string
-    import io  # function-local import of the io module
-    buffered = io.BytesIO()  # in-memory buffer that receives the PNG bytes
-    pil_image.save(buffered, format="PNG")
-    return base64.b64encode(buffered.getvalue()).decode("utf-8")  # base64 bytes decoded to str
 # --- Model Loading ---
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -215,7 +20,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 # Scheduler configuration for Lightning
 scheduler_config = {
     "base_image_seq_len": 256,
-    "base_shift": math.log(3),
     "invert_sigmas": False,
     "max_image_seq_len": 8192,
     "max_shift": math.log(3),
@@ -262,7 +67,9 @@ def use_output_as_input(output_images):
 # --- Main Inference Function (with hardcoded negative prompt) ---
 @spaces.GPU()
 def infer(
-    images,
     prompt,
     seed=42,
     randomize_seed=False,
@@ -270,7 +77,6 @@ def infer(
     num_inference_steps=4,
     height=None,
     width=None,
-    rewrite_prompt=True,
     num_images_per_prompt=1,
     progress=gr.Progress(track_tqdm=True),
 ):
@@ -303,28 +109,25 @@ def infer(
     # Set up the generator for reproducibility
     generator = torch.Generator(device=device).manual_seed(seed)
-    # Load input images into PIL Images
     pil_images = []
-    if images is not None:
-        for item in images:
-            try:
-                if isinstance(item[0], Image.Image):
-                    pil_images.append(item[0].convert("RGB"))
-                elif isinstance(item[0], str):
-                    pil_images.append(Image.open(item[0]).convert("RGB"))
-                elif hasattr(item, "name"):
-                    pil_images.append(Image.open(item.name).convert("RGB"))
-            except Exception:
-                continue
     if height==256 and width==256:
         height, width = None, None
     print(f"Calling pipeline with prompt: '{prompt}'")
     print(f"Negative Prompt: '{negative_prompt}'")
     print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}, Size: {width}x{height}")
-    if rewrite_prompt and len(pil_images) > 0:
-        prompt = polish_prompt_hf(prompt, pil_images)
-        print(f"Rewritten Prompt: {prompt}")
     # Generate the image
@@ -375,15 +178,14 @@ with gr.Blocks(css=css) as demo:
         """)
         with gr.Row():
             with gr.Column():
-                input_images = gr.Gallery(label="Input Images",
-                                          show_label=False,
-                                          type="pil",
-                                          interactive=True)
             with gr.Column():
-                result = gr.Gallery(label="Result", show_label=False, type="pil", interactive=False)
                 # Add this button right after the result gallery - initially hidden
-                use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False)
         with gr.Row():
             prompt = gr.Text(
@@ -440,9 +242,6 @@ with gr.Blocks(css=css) as demo:
                     step=8,
                     value=None,
                 )
-                rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=True)
         # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
@@ -450,7 +249,9 @@ with gr.Blocks(css=css) as demo:
         triggers=[run_button.click, prompt.submit],
         fn=infer,
         inputs=[
-            input_images,
             prompt,
             seed,
             randomize_seed,
@@ -458,7 +259,6 @@ with gr.Blocks(css=css) as demo:
             num_inference_steps,
             height,
             width,
-            rewrite_prompt,
         ],
         outputs=[result, seed, use_output_btn],  # Added use_output_btn to outputs
     )
@@ -467,7 +267,7 @@ with gr.Blocks(css=css) as demo:
     use_output_btn.click(
         fn=use_output_as_input,
         inputs=[result],
-        outputs=[input_images]
     )
 if __name__ == "__main__":

 # from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
 # from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 import math
 # --- Model Loading ---
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Scheduler configuration for Lightning
 scheduler_config = {
     "base_image_seq_len": 256,
+    "base_shift": math.log(5),
     "invert_sigmas": False,
     "max_image_seq_len": 8192,
     "max_shift": math.log(3),
 # --- Main Inference Function (with hardcoded negative prompt) ---
 @spaces.GPU()
 def infer(
+    image_1,
+    image_2,
+    image_3,
     prompt,
     seed=42,
     randomize_seed=False,
     num_inference_steps=4,
     height=None,
     width=None,
     num_images_per_prompt=1,
     progress=gr.Progress(track_tqdm=True),
 ):
     # Set up the generator for reproducibility
     generator = torch.Generator(device=device).manual_seed(seed)
+    # Load input images into a list of PIL Images
     pil_images = []
+    for item in [image_1, image_2, image_3]:
+        if item is None: continue
+        try:
+            if isinstance(item[0], Image.Image):
+                pil_images.append(item[0].convert("RGB"))
+            elif isinstance(item[0], str):
+                pil_images.append(Image.open(item[0]).convert("RGB"))
+            elif hasattr(item, "name"):
+                pil_images.append(Image.open(item.name).convert("RGB"))
+        except Exception:
+            continue
     if height==256 and width==256:
         height, width = None, None
     print(f"Calling pipeline with prompt: '{prompt}'")
     print(f"Negative Prompt: '{negative_prompt}'")
     print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}, Size: {width}x{height}")
     # Generate the image
         """)
         with gr.Row():
             with gr.Column():
+                image_1 = gr.Image(label="image 1", type="pil", interactive=True)
+                image_2 = gr.Image(label="image 2", type="pil", interactive=True)
+                image_3 = gr.Image(label="image 3", type="pil", interactive=True)
             with gr.Column():
+                result = gr.Image(label="Result", type="pil", interactive=False)
                 # Add this button right after the result gallery - initially hidden
+                use_output_btn = gr.Button("↗️ Use as image 1", variant="secondary", size="sm", visible=False)
         with gr.Row():
             prompt = gr.Text(
                     step=8,
                     value=None,
                 )
         # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
         triggers=[run_button.click, prompt.submit],
         fn=infer,
         inputs=[
+            image_1,
+            image_2,
+            image_3,
             prompt,
             seed,
             randomize_seed,
             num_inference_steps,
             height,
             width,
         ],
         outputs=[result, seed, use_output_btn],  # Added use_output_btn to outputs
     )
     use_output_btn.click(
         fn=use_output_as_input,
         inputs=[result],
+        outputs=[image_1]
     )
 if __name__ == "__main__":