Spaces:

ziadmostafa
/

Multimodal-Image-Content-Creator

Sleeping

File size: 6,865 Bytes

import io
import os
import tempfile

import gradio as gr
import requests
from PIL import Image

class MultimodalImageCreator:
    """
    Client for two Hugging Face Inference API endpoints: one that captions
    an image and one that generates images from a text prompt.
    """

    # Upper bound (seconds) for every HTTP call. `requests` has no default
    # timeout, so without this a stalled endpoint would block the caller forever.
    REQUEST_TIMEOUT = 120

    def __init__(self):
        """
        Initialize the Multimodal Image Creator
        Uses environment variables for API token

        Raises:
            ValueError: If the HF_API_TOKEN environment variable is unset or empty
        """
        # Retrieve API token from environment variable
        self.hf_token = os.environ.get('HF_API_TOKEN')

        if not self.hf_token:
            raise ValueError(
                "Hugging Face API token not found. "
                "Set it in Spaces secrets or as an environment variable."
            )

        # Image Captioning API Endpoint
        self.caption_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

        # Text-to-Image API Endpoint
        self.image_gen_api_url = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"

        # Headers for requests whose body is raw image bytes (captioning)
        self.headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/octet-stream"
        }

    def generate_caption(self, image_path):
        """
        Generate a caption for the input image using Hugging Face API

        This method never raises: every failure is reported as a string so
        the caller can show it directly. Failure strings start with
        "Error:" (non-200 response) or "An error occurred:" (exception).

        Args:
            image_path (str): Path to the input image

        Returns:
            str: Generated image caption, or a failure message as described above
        """
        try:
            # Read the image file
            with open(image_path, "rb") as f:
                data = f.read()

            # Send the raw bytes; bounded by REQUEST_TIMEOUT so it cannot hang
            response = requests.post(
                self.caption_api_url,
                headers=self.headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                return f"Error: {response.status_code} - {response.text}"

            # The code reads the first element of the JSON body and its
            # 'generated_text' key; any other shape falls into the except below
            return response.json()[0].get('generated_text', 'No caption generated')

        except Exception as e:
            # Deliberately broad: this is the boundary that turns any failure
            # (file, network, malformed JSON) into a displayable message
            return f"An error occurred: {str(e)}"

    def generate_variations(self, caption, num_variations=3):
        """
        Generate image variations based on the input caption

        Each variation is requested independently, so a failure on one
        request is logged and skipped instead of discarding the images that
        were already generated.

        Args:
            caption (str): Base caption to generate images from
            num_variations (int): Number of image variations to generate;
                integral floats (e.g. 3.0) are accepted and truncated

        Returns:
            list: Generated PIL images (may hold fewer than requested,
                or be empty, when requests fail)
        """
        generated_images = []

        # UI widgets may deliver the count as a float, which range() rejects
        try:
            count = int(num_variations)
        except (TypeError, ValueError) as e:
            print(f"An error occurred during image generation: {str(e)}")
            return generated_images

        # JSON body here, so the content type differs from self.headers
        json_headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/json"
        }

        for i in range(count):
            # Create a slightly varied prompt
            varied_prompt = f"{caption}, artistic variation {i+1}, high quality"

            try:
                response = requests.post(
                    self.image_gen_api_url,
                    headers=json_headers,
                    json={"inputs": varied_prompt},
                    timeout=self.REQUEST_TIMEOUT,
                )

                if response.status_code == 200:
                    # Response body is the encoded image; wrap it for PIL
                    generated_images.append(Image.open(io.BytesIO(response.content)))
                else:
                    print(f"Error generating variation {i+1}: {response.status_code}")
            except Exception as e:
                # Keep going: one failed variation must not wipe out the rest
                print(f"An error occurred during image generation: {str(e)}")

        return generated_images

def create_gradio_interface():
    """
    Create a Gradio interface for the Multimodal Image Creator

    Returns:
        gr.Blocks: Gradio interface

    Raises:
        ValueError: Propagated from MultimodalImageCreator() when the API
            token environment variable is not set
    """
    # Initialize the multimodal image creator
    creator = MultimodalImageCreator()

    # Prefixes generate_caption uses when it returns a failure message in
    # place of a caption. A genuine caption starting with one of these would
    # be misread as a failure, which is accepted as unlikely.
    caption_error_prefixes = ("Error:", "An error occurred:")

    def process_image(input_image, num_variations):
        """
        Caption the uploaded image and generate variations from the caption

        Args:
            input_image (numpy.ndarray | None): Uploaded image, or None
            num_variations (int | float): Slider value for how many images to make

        Returns:
            tuple: (image to display, caption or error text,
                list of generated images, newline-joined variation prompts)
        """
        try:
            # Validate input
            if input_image is None:
                return None, "Please upload an image.", [], ""

            # Use a unique temp file per request so concurrent users do not
            # overwrite each other's upload; close the descriptor right away
            # because PIL reopens the file by path
            fd, temp_image_path = tempfile.mkstemp(suffix=".jpg")
            os.close(fd)
            try:
                Image.fromarray(input_image).save(temp_image_path)
                original_caption = creator.generate_caption(temp_image_path)
            finally:
                # Always clean up, even when saving or captioning fails
                os.remove(temp_image_path)

            # Do not feed an error message to the image generator as a prompt
            if original_caption.startswith(caption_error_prefixes):
                return input_image, original_caption, [], ""

            # Create variations; the slider may deliver a float
            generated_images = creator.generate_variations(
                original_caption,
                num_variations=int(num_variations),
            )

            # The captions go to a Textbox, which needs a single string
            variation_captions = "\n".join(
                f"Variation based on: {original_caption}"
                for _ in generated_images
            )

            return input_image, original_caption, generated_images, variation_captions

        except Exception as e:
            # UI boundary: show the failure instead of crashing the callback
            return None, f"An error occurred: {str(e)}", [], ""

    # Create Gradio Interface
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Image Content Creator")
        gr.Markdown("Upload an image to generate a caption and create variations!")

        with gr.Row():
            # Input components
            with gr.Column():
                input_image = gr.Image(type="numpy", label="Upload Image")
                num_variations = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="Number of Variations"
                )
                submit_btn = gr.Button("Generate Variations")

            # Output components
            with gr.Column():
                # Original image and caption
                original_image_output = gr.Image(label="Original Image")
                original_caption = gr.Textbox(label="Generated Caption")

                # Variations gallery
                variations_gallery = gr.Gallery(label="Image Variations")
                variations_captions = gr.Textbox(label="Variation Prompts")

        # Set up the processing
        submit_btn.click(
            fn=process_image,
            inputs=[input_image, num_variations],
            outputs=[
                original_image_output,
                original_caption,
                variations_gallery,
                variations_captions
            ]
        )

    return demo

# Build the interface at import time and expose it as module-level `demo`
demo = create_gradio_interface()

# Start the server only when this file is executed directly as a script
if __name__ == "__main__":
    demo.launch(share=True, debug=True)