Gemini-Image-Edit

Running

App Files Files Community

Varhal committed on May 2, 2025

Commit

b35c170

verified ·

1 Parent(s): 6fd2189

updated generation flow (added prompt to get tags and description)

Browse files

Files changed (1) hide show

app.py +310 -103

app.py CHANGED Viewed

@@ -7,107 +7,315 @@ from PIL import Image, ImageDraw, ImageFont
 import gradio as gr
 import base64
 import mimetypes
 from google import genai
-from google.genai import types
-# Function for saving a binary file (left unchanged)
 def save_binary_file(file_name, data):
-    with open(file_name, "wb") as f:
-        f.write(data)
-# Modified generate function - api_key is no longer a parameter
 def generate(text, file_name, model="gemini-2.0-flash-exp"):
-    # Initialize the client, reading the key from the geminigoogle environment variable
-    # Make sure the geminigoogle variable is set in your Space settings
     api_key = os.environ.get("geminigoogle")
     if not api_key:
-        raise ValueError("GEMINI_API_KEY environment variable (geminigoogle) not set.")
-    client = genai.Client(api_key=api_key)
-    # The rest of the generate function stays unchanged
-    files = [ client.files.upload(file=file_name) ]
-    contents = [
-        types.Content(
-            role="user",
-            parts=[
-                types.Part.from_uri(
-                    file_uri=files[0].uri,
-                    mime_type=files[0].mime_type,
-                ),
-                types.Part.from_text(text=text),
-            ],
-        ),
-    ]
-    generate_content_config = types.GenerateContentConfig(
-        temperature=1,
-        top_p=0.95,
-        top_k=40,
-        max_output_tokens=8192,
-        response_modalities=["image", "text"],
-        response_mime_type="text/plain",
-    )
-    text_response = ""
-    image_path = None
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-        temp_path = tmp.name
-        for chunk in client.models.generate_content_stream(
-            model=model,
-            contents=contents,
-            config=generate_content_config,
-        ):
-            if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
-                continue
-            candidate = chunk.candidates[0].content.parts[0]
-            text_part = getattr(candidate, "text", "")
-            if text_part:
-                text_response += text_part + "\n"
-            if candidate.inline_data:
-                save_binary_file(temp_path, candidate.inline_data.data)
-                print(f"File of mime type {candidate.inline_data.mime_type} saved to: {temp_path} and prompt input: {text}")
-                image_path = temp_path
-                break
-        # Drop the local reference to the uploaded files after use (this `del` only unbinds the name)
-        del files
     return image_path, text_response
-# Modified process_image_and_prompt function - gemini_api_key is no longer a parameter
-def process_image_and_prompt(composite_pil, prompt):
     try:
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
             composite_path = tmp.name
             composite_pil.save(composite_path)
-        file_name = composite_path
-        input_text = prompt
-        model = "gemini-2.0-flash-exp" # The model is specified here
-        # Call generate without api_key
-        image_path, text_response = generate(text=input_text, file_name=file_name, model=model)
-        if image_path:
-            result_img = Image.open(image_path)
-            if result_img.mode == "RGBA":
-                result_img = result_img.convert("RGB")
-            return [result_img], text_response
         else:
-            return None, text_response
     except Exception as e:
-        # Important: remove the temporary file on error
-        if 'composite_path' in locals() and os.path.exists(composite_path):
-             os.remove(composite_path)
-        raise gr.Error(f"Error Getting {e}", duration=5)
-# Gradio інтерфейс
 with gr.Blocks( # css_paths="style.css", # Тимчасово закоментували цей рядок
     ) as demo:
     gr.HTML(
@@ -127,19 +335,23 @@ with gr.Blocks( # css_paths="style.css", # Тимчасово закоменту
     """
     )
-    # Прибираємо секцію API Configuration або змінюємо її опис, оскільки ключ більше не вводиться
     with gr.Accordion("⚠️ API Configuration ⚠️", open=False, elem_classes="config-accordion"):
         gr.Markdown("""
-    - **Ваш Gemini API ключ має бути збережений у змінній оточення `geminigoogle` в налаштуваннях Hugging Face Space.**
-    - ❗ Іноді модель повертає текст замість зображення.
     """)
     with gr.Accordion("📌 Usage Instructions", open=False, elem_classes="instructions-accordion"):
         gr.Markdown("""
     ### 📌 Usage
-      - Upload an image and enter a prompt to generate outputs.
-      - If text is returned instead of an image, it will appear in the text output.
-      - Upload Only PNG Image
       - ❌ **Do not use NSFW images!**
     """)
@@ -148,43 +360,38 @@ with gr.Blocks( # css_paths="style.css", # Тимчасово закоменту
             image_input = gr.Image(
                 type="pil",
                 label="Upload Image",
-                image_mode="RGBA",
                 elem_id="image-input",
                 elem_classes="upload-box"
             )
-            # Прибираємо поле введення API ключа з інтерфейсу
-            # gemini_api_key = gr.Textbox(
-            #     lines=1,
-            #     placeholder="Enter Gemini API Key (optional)",
-            #     label="Gemini API Key (optional)",
-            #     elem_classes="api-key-input"
-            # )
             prompt_input = gr.Textbox(
                 lines=2,
-                placeholder="Enter prompt here...",
-                label="Prompt",
                 elem_classes="prompt-input"
             )
-            submit_btn = gr.Button("Generate", elem_classes="generate-btn")
         with gr.Column(elem_classes="output-column"):
-            output_gallery = gr.Gallery(label="Generated Outputs", elem_classes="output-gallery")
             output_text = gr.Textbox(
-                label="Gemini Output",
-                placeholder="Text response will appear here if no image is generated.",
-                elem_classes="output-text"
             )
-    # Налаштовуємо взаємодію - прибираємо gemini_api_key з inputs
     submit_btn.click(
         fn=process_image_and_prompt,
-        inputs=[image_input, prompt_input], # Передаємо лише image_input та prompt_input
         outputs=[output_gallery, output_text],
     )
     gr.Markdown("## Try these examples", elem_classes="gr-examples-header")
-    # Приклади залишаємо без змін, API ключ в них не потрібен
     examples = [
         ["data/1.webp", 'change text to "AMEER"'],
         ["data/2.webp", "remove the spoon from hand only"],
@@ -198,7 +405,7 @@ with gr.Blocks( # css_paths="style.css", # Тимчасово закоменту
     gr.Examples(
         examples=examples,
-        inputs=[image_input, prompt_input], # Приклади також не потребують API ключа
         elem_id="examples-grid"
     )

 import gradio as gr
 import base64
 import mimetypes
+# Make sure you have installed the google-genai library (it provides `from google import genai`)
+# pip install google-genai Pillow gradio
 from google import genai
+from google.genai import types # Using the newer client API structure if available
+# Helper that persists raw bytes to disk
 def save_binary_file(file_name, data):
+    """
+    Write ``data`` to ``file_name`` in binary mode.
+
+    Any failure is reported on stdout and then re-raised unchanged, so the
+    caller still sees the original exception.
+    """
+    try:
+        out_file = open(file_name, "wb")
+        with out_file:
+            out_file.write(data)
+    except Exception as err:
+        # Log first, then let the caller decide how to handle the failure.
+        print(f"Error saving binary data to {file_name}: {err}")
+        raise
+# Best-effort removal of a temporary file; failures are logged, never raised
+def _remove_quietly(path):
+    """Delete ``path`` if it exists, printing (not raising) any OS error."""
+    if path and os.path.exists(path):
+        try:
+            os.remove(path)
+        except OSError as ce:
+            print(f"Error cleaning up temp generated file {path}: {ce}")
+# Stream a Gemini response for an image + prompt, collecting all text and the first image
 def generate(text, file_name, model="gemini-2.0-flash-exp"):
+    """
+    Send an image file and a text prompt to a Gemini model and stream the reply.
+
+    Every text part of the stream is concatenated in order; the first inline
+    image part is written to a temporary ``.png`` file and later image parts
+    are ignored.
+
+    Args:
+        text: Prompt sent together with the image.
+        file_name: Path of the local image file to upload.
+        model: Name of the Gemini model to call.
+
+    Returns:
+        ``(image_path, text_response)``. ``image_path`` is the temporary file
+        holding the generated image, or ``None`` if no image was produced;
+        the caller is responsible for deleting that file.
+
+    Raises:
+        gr.Error: If the API key is missing, the client cannot be created,
+            the upload fails on every attempt, or generation fails.
+    """
+    # Imported locally so the retry backoff does not rely on a module-level import.
+    import time
     api_key = os.environ.get("geminigoogle")
     if not api_key:
+        # Use gr.Error for Gradio interface display
+        raise gr.Error("GEMINI_API_KEY environment variable (geminigoogle) not set.", duration=10)
+    # The google-genai SDK (`from google import genai`) has no module-level
+    # configure()/get_model(); uploads and generation both go through a Client.
+    try:
+        client = genai.Client(api_key=api_key)
+    except Exception as e:
+        raise gr.Error(f"Failed to initialize Gemini client: {e}", duration=10) from e
+    uploaded_file = None # Reference to the file uploaded to Google's service
+    temp_generated_img_path = None # Temp file created for the generated image, if any
+    image_path = None # Path of the *first* generated image once it is saved
+    text_parts = []
+    try:
+        # Upload the input file; retry because uploads can be flaky
+        upload_attempts = 3
+        for attempt in range(1, upload_attempts + 1):
+            try:
+                uploaded_file = client.files.upload(file=file_name)
+                print(f"Input file uploaded successfully: {uploaded_file.uri}")
+                break
+            except Exception as upload_e:
+                if attempt == upload_attempts:
+                    raise gr.Error(f"Failed to upload input file after multiple attempts: {upload_e}", duration=10) from upload_e
+                print(f"Upload attempt {attempt}/{upload_attempts} failed: {upload_e}. Retrying...")
+                time.sleep(attempt) # Simple linear backoff
+        # Construct the contents for the model input (image + text)
+        contents = [
+            types.Content(
+                role="user",
+                parts=[
+                    types.Part.from_uri(
+                        file_uri=uploaded_file.uri,
+                        mime_type=uploaded_file.mime_type,
+                    ),
+                    types.Part.from_text(text=text),
+                ],
+            ),
+        ]
+        generate_content_config = types.GenerateContentConfig(
+            temperature=1,
+            top_p=0.95,
+            top_k=40,
+            max_output_tokens=8192,
+            response_modalities=["image", "text"], # Ask for BOTH image and text
+            response_mime_type="text/plain",
+        )
+        print(f"\n--- Sending Request to Model '{model}' ---")
+        print(f"Prompt: {text}")
+        print(f"Input Image URI: {uploaded_file.uri}")
+        try:
+            stream = client.models.generate_content_stream(
+                model=model,
+                contents=contents,
+                config=generate_content_config,
+            )
+            for chunk in stream:
+                # Skip keep-alive / empty chunks
+                if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
+                    continue
+                for part in chunk.candidates[0].content.parts:
+                    text_part = getattr(part, "text", "")
+                    if text_part:
+                        text_parts.append(text_part)
+                    inline = getattr(part, "inline_data", None)
+                    # Only the first image in the stream is kept.
+                    if image_path is not None or not inline or not inline.data:
+                        continue
+                    print(f"Received image data of mime type {inline.mime_type}")
+                    # Create the temp file lazily so a text-only reply leaves nothing behind.
+                    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                        temp_generated_img_path = tmp.name
+                    try:
+                        save_binary_file(temp_generated_img_path, inline.data)
+                    except Exception as e:
+                        # A failed save is not fatal: drop the file and keep reading the stream.
+                        print(f"Error saving image data to {temp_generated_img_path}: {e}")
+                        _remove_quietly(temp_generated_img_path)
+                        temp_generated_img_path = None
+                    else:
+                        image_path = temp_generated_img_path
+                        print(f"Image data saved to: {image_path}")
+                        # Do not break: text parts may still follow the image.
+            print("Response stream complete.")
+            print(f"Final Image Path: {image_path}")
+        except Exception as e:
+            print(f"\nAn error occurred during content generation stream: {e}")
+            # Nothing is returned on failure, so any image file written so far would leak.
+            _remove_quietly(temp_generated_img_path)
+            raise gr.Error(f"Gemini generation error: {e}", duration=10) from e
+    finally:
+        # Always delete the uploaded file from Google's service
+        if uploaded_file is not None:
+            try:
+                print(f"Deleting uploaded file: {uploaded_file.name}")
+                # `name` is keyword-only in the google-genai Files API.
+                client.files.delete(name=uploaded_file.name)
+                print("Uploaded file deleted.")
+            except Exception as e:
+                print(f"Error deleting uploaded file {uploaded_file.name}: {e}")
+    text_response = "".join(text_parts)
+    print(f"Accumulated Text Response Length: {len(text_response)}")
+    # Return the path to the saved image and the accumulated text
     return image_path, text_response
+# Gradio callback: upload the input image, run the combined analyse + edit prompt, return results
+def process_image_and_prompt(composite_pil: Image.Image, prompt: str):
+    """
+    Run one Gemini request for the uploaded image and return gallery/text output.
+
+    The user's prompt is embedded in a larger prompt that first asks the model
+    for a description and tags of the input image and then for the requested
+    edit/generation.
+
+    Args:
+        composite_pil: Image supplied by the Gradio image component.
+        prompt: User instruction describing the edit or generation task.
+
+    Returns:
+        ``(gallery_content, text_response)`` where ``gallery_content`` is a
+        one-element list holding the generated PIL image, or ``None`` when no
+        image could be produced.
+
+    Raises:
+        gr.Error: If no image was uploaded or any processing step fails.
+    """
+    if composite_pil is None:
+        # Fail early with a clear message instead of an AttributeError on None.
+        raise gr.Error("Please upload an image before submitting.", duration=10)
+    prompt = prompt or "" # Avoid interpolating the literal text "None" into the prompt
+    composite_path = None # Path for the temporary input image file
+    temp_generated_image_path_returned = None # Path of the generated image returned by generate
     try:
+        # 1. Reserve a temporary path, then save once the handle is closed so the
+        #    file is not opened twice at the same time.
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
             composite_path = tmp.name
+        # Ensure image is RGB or RGBA before saving as PNG for compatibility
+        if composite_pil.mode not in ("RGB", "RGBA"):
+            composite_pil = composite_pil.convert("RGBA")
+        composite_pil.save(composite_path)
+        print(f"Input image saved to temporary path for upload: {composite_path}")
+        # 2. Combined prompt: analysis (description + tags) first, then the user's task
+        combined_prompt = f"""
+Analyze the input image carefully.
+Provide a detailed description of the image, including key objects, actions, setting, and style.
+Then, provide a comma-separated list of relevant tags for the input image.
+Structure this analysis clearly, for example:
+Description: [Detailed description here]
+Tags: [tag1, tag2, tag3, ...]
+After the analysis, perform the following task based on the input image and these instructions:
+{prompt}
+"""
+        print(f"\n--- Combined Prompt Sent to Model ---")
+        print(combined_prompt)
+        # 3. Single API request: input image + text in, output image (optional) + text out
+        temp_generated_image_path_returned, text_response = generate(text=combined_prompt, file_name=composite_path, model="gemini-2.0-flash-exp")
+        # 4. Load the generated image, if any
+        result_img = None
+        if temp_generated_image_path_returned and os.path.exists(temp_generated_image_path_returned):
+            try:
+                # Image.open is lazy; read the pixels now because the backing
+                # file is deleted in the finally block below.
+                with Image.open(temp_generated_image_path_returned) as img:
+                    result_img = img.convert("RGB") if img.mode == "RGBA" else img.copy()
+                print(f"\nGenerated image loaded successfully from {temp_generated_image_path_returned}.")
+            except Exception as img_e:
+                print(f"\nError loading generated image from {temp_generated_image_path_returned}: {img_e}")
+                # If loading fails, treat it as if no image was successfully generated
+                result_img = None
         else:
+            # The model might fail to generate an image but still provide text
+            print("\nNo valid generated image path returned or file not found after generation.")
+        # 5. Gradio's Gallery expects a list of images or None
+        output_gallery_content = [result_img] if result_img else None
+        print("\n--- Final Output Prepared for Gradio ---")
+        print("Image Generated Successfully:", result_img is not None)
+        print(f"Text Response Length: {len(text_response)}")
+        print("Text Response (showing first 500 chars):\n", text_response[:500] + ('...' if len(text_response) > 500 else ''))
+        return output_gallery_content, text_response
+    except gr.Error:
+        # Already a user-facing error (e.g. raised by generate); do not wrap it again.
+        raise
     except Exception as e:
+        print(f"\nAn error occurred in process_image_and_prompt: {e}")
+        # Use gr.Error to display the error message nicely in the Gradio interface
+        raise gr.Error(f"Processing Error: {e}", duration=10) from e
+    finally:
+        # 6. Clean up both temporary files regardless of success or failure
+        for label, path in (("input", composite_path), ("generated", temp_generated_image_path_returned)):
+            if path and os.path.exists(path):
+                try:
+                    os.remove(path)
+                    print(f"Removed temporary {label} file: {path}")
+                except OSError as ce:
+                    print(f"Error removing {label} temp file {path}: {ce}")
+# Gradio interface - Keep this section mostly the same
 with gr.Blocks( # css_paths="style.css", # Тимчасово закоментували цей рядок
     ) as demo:
     gr.HTML(
     """
     )
     with gr.Accordion("⚠️ API Configuration ⚠️", open=False, elem_classes="config-accordion"):
         gr.Markdown("""
+    - **Your Gemini API key must be stored in the environment variable `geminigoogle` in your Hugging Face Space settings (Settings -> Repository secrets).**
+    - ❗ Sometimes the model may return only text or encounter errors.
+    - The text output box below should contain the model's analysis of the *input image* (description and tags) followed by any commentary related to the edit/generation.
     """)
     with gr.Accordion("📌 Usage Instructions", open=False, elem_classes="instructions-accordion"):
         gr.Markdown("""
     ### 📌 Usage
+      - Upload an image and enter a prompt describing the *image edit or generation* you want.
+      - The model will analyze the input image and attempt to perform the edit/generation.
+      - The generated image will appear in the gallery (if successful).
+      - The text output will contain:
+          1. A description and tags of the **input image**.
+          2. Any commentary from the model about the edit/generation task.
+      - Upload Only PNG Image (recommended for transparent edits, but JPG often works)
       - ❌ **Do not use NSFW images!**
     """)
             image_input = gr.Image(
                 type="pil",
                 label="Upload Image",
+                image_mode="RGBA", # Use RGBA to handle transparency
                 elem_id="image-input",
                 elem_classes="upload-box"
             )
             prompt_input = gr.Textbox(
                 lines=2,
+                placeholder="Enter your image edit or generation prompt here (e.g., 'add a red hat', 'change background to a beach', 'make the eyes green').",
+                label="Image Task Prompt",
                 elem_classes="prompt-input"
             )
+            submit_btn = gr.Button("Generate & Analyze", elem_classes="generate-btn") # Button text reflects dual task
         with gr.Column(elem_classes="output-column"):
+            output_gallery = gr.Gallery(label="Generated Image Output", elem_classes="output-gallery", preview=True)
             output_text = gr.Textbox(
+                label="Gemini Text Output (Input Image Analysis + Edit Commentary)",
+                placeholder="Analysis of the input image (description, tags) and commentary on the image task will appear here.",
+                elem_classes="output-text",
+                lines=10, # Give more space for the text output
+                show_copy_button=True # Allow easy copying of the text
             )
+    # Set up the interaction
     submit_btn.click(
         fn=process_image_and_prompt,
+        inputs=[image_input, prompt_input],
         outputs=[output_gallery, output_text],
     )
     gr.Markdown("## Try these examples", elem_classes="gr-examples-header")
+    # Examples (adjust if necessary based on new prompt structure)
     examples = [
         ["data/1.webp", 'change text to "AMEER"'],
         ["data/2.webp", "remove the spoon from hand only"],
     gr.Examples(
         examples=examples,
+        inputs=[image_input, prompt_input],
         elem_id="examples-grid"
     )