Spaces:

ziadmostafa
/

Multimodal-Image-Content-Creator

Sleeping

App Files Files Community

ziadmostafa committed on Dec 11, 2024

Commit

ffc2acd

1 Parent(s): 30fa863

first commit

Browse files

Files changed (2) hide show

app.py +203 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import os
+import requests
+import gradio as gr
+from PIL import Image
+import io
+class MultimodalImageCreator:
+    def __init__(self):
+        """
+        Initialize the Multimodal Image Creator
+        Uses environment variables for API token
+        """
+        # Retrieve API token from environment variable
+        self.hf_token = os.environ.get('HF_API_TOKEN')
+        if not self.hf_token:
+            raise ValueError(
+                "Hugging Face API token not found. "
+                "Set it in Spaces secrets or as an environment variable."
+            )
+        # Image Captioning API Endpoint
+        self.caption_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
+        # Text-to-Image API Endpoint
+        self.image_gen_api_url = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"
+        # Common headers for API requests
+        self.headers = {
+            "Authorization": f"Bearer {self.hf_token}",
+            "Content-Type": "application/octet-stream"
+        }
+    def generate_caption(self, image_path):
+        """
+        Generate a caption for the input image using Hugging Face API
+        Args:
+            image_path (str): Path to the input image
+        Returns:
+            str: Generated image caption
+        """
+        try:
+            # Read the image file
+            with open(image_path, "rb") as f:
+                data = f.read()
+            # Make API request
+            response = requests.post(
+                self.caption_api_url,
+                headers=self.headers,
+                data=data
+            )
+            # Check response
+            if response.status_code == 200:
+                # Extract caption from response
+                caption = response.json()[0].get('generated_text', 'No caption generated')
+                return caption
+            else:
+                return f"Error: {response.status_code} - {response.text}"
+        except Exception as e:
+            return f"An error occurred: {str(e)}"
+    def generate_variations(self, caption, num_variations=3):
+        """
+        Generate image variations based on the input caption
+        Args:
+            caption (str): Base caption to generate images from
+            num_variations (int): Number of image variations to generate
+        Returns:
+            list: Generated image variations
+        """
+        generated_images = []
+        try:
+            for i in range(num_variations):
+                # Create a slightly varied prompt
+                varied_prompt = f"{caption}, artistic variation {i+1}, high quality"
+                # Make API request
+                response = requests.post(
+                    self.image_gen_api_url,
+                    headers={
+                        "Authorization": f"Bearer {self.hf_token}",
+                        "Content-Type": "application/json"
+                    },
+                    json={"inputs": varied_prompt}
+                )
+                # Check response
+                if response.status_code == 200:
+                    # Convert response to PIL Image
+                    image = Image.open(io.BytesIO(response.content))
+                    generated_images.append(image)
+                else:
+                    print(f"Error generating variation {i+1}: {response.status_code}")
+            return generated_images
+        except Exception as e:
+            print(f"An error occurred during image generation: {str(e)}")
+            return []
+def create_gradio_interface():
+    """
+    Create a Gradio interface for the Multimodal Image Creator
+    Returns:
+        gr.Blocks: Gradio interface
+    """
+    # Initialize the multimodal image creator
+    creator = MultimodalImageCreator()
+    def process_image(input_image, num_variations):
+        try:
+            # Validate input
+            if input_image is None:
+                return None, "Please upload an image.", [], []
+            # Save the uploaded image temporarily
+            temp_image_path = "temp_input_image.jpg"
+            Image.fromarray(input_image).save(temp_image_path)
+            # Generate caption
+            original_caption = creator.generate_caption(temp_image_path)
+            # Create variations
+            generated_images = creator.generate_variations(
+                original_caption,
+                num_variations=num_variations
+            )
+            # Clean up temporary file
+            os.remove(temp_image_path)
+            # Generate variation captions
+            variation_captions = [
+                f"Variation based on: {original_caption}"
+                for _ in generated_images
+            ]
+            return input_image, original_caption, generated_images, variation_captions
+        except Exception as e:
+            return None, f"An error occurred: {str(e)}", [], []
+    # Create Gradio Interface
+    with gr.Blocks() as demo:
+        gr.Markdown("# Multimodal Image Content Creator")
+        gr.Markdown("Upload an image to generate a caption and create variations!")
+        with gr.Row():
+            # Input components
+            with gr.Column():
+                input_image = gr.Image(type="numpy", label="Upload Image")
+                num_variations = gr.Slider(
+                    minimum=1,
+                    maximum=5,
+                    value=3,
+                    step=1,
+                    label="Number of Variations"
+                )
+                submit_btn = gr.Button("Generate Variations")
+            # Output components
+            with gr.Column():
+                # Original image and caption
+                original_image_output = gr.Image(label="Original Image")
+                original_caption = gr.Textbox(label="Generated Caption")
+                # Variations gallery
+                variations_gallery = gr.Gallery(label="Image Variations")
+                variations_captions = gr.Textbox(label="Variation Prompts")
+        # Set up the processing
+        submit_btn.click(
+            fn=process_image,
+            inputs=[input_image, num_variations],
+            outputs=[
+                original_image_output,
+                original_caption,
+                variations_gallery,
+                variations_captions
+            ]
+        )
+    return demo
+# Create and launch the Gradio interface
+demo = create_gradio_interface()
+# If running locally
+if __name__ == "__main__":
+    demo.launch(
+        share=True,
+        debug=True
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+requests
+gradio
+pillow