"""Gradio app that captions an uploaded image and generates image variations.

The caption is produced by the Hugging Face Inference API (BLIP captioning
model) and is then used as the prompt for a text-to-image model (Stable
Diffusion 2) to create the variations.
"""
import io
import os
import tempfile

import gradio as gr
import requests
from PIL import Image


class _CaptionError(Exception):
    """Raised when the captioning endpoint answers with a non-200 status."""


class MultimodalImageCreator:
    """Client for the captioning and text-to-image Inference API endpoints."""

    # Seconds to wait for an API response before giving up; without a
    # timeout, ``requests`` would block forever on a stalled connection.
    REQUEST_TIMEOUT = 120

    def __init__(self):
        """
        Initialize the Multimodal Image Creator.

        Reads the API token from the ``HF_API_TOKEN`` environment variable.

        Raises:
            ValueError: If ``HF_API_TOKEN`` is unset or empty.
        """
        # Retrieve API token from environment variable
        self.hf_token = os.environ.get('HF_API_TOKEN')

        if not self.hf_token:
            raise ValueError(
                "Hugging Face API token not found. "
                "Set it in Spaces secrets or as an environment variable."
            )

        # Image Captioning API Endpoint
        self.caption_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

        # Text-to-Image API Endpoint
        self.image_gen_api_url = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"

        # Headers for requests that upload raw image bytes
        self.headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/octet-stream"
        }

    def _fetch_caption(self, image_path):
        """
        Request a caption for the image stored at ``image_path``.

        Args:
            image_path (str): Path to the input image

        Returns:
            str: Generated caption, or 'No caption generated' when the
                response carries no ``generated_text`` field

        Raises:
            _CaptionError: If the API answers with a non-200 status.
            Exception: Anything raised while reading the file, sending the
                request, or decoding the response.
        """
        # Read the image file
        with open(image_path, "rb") as f:
            data = f.read()

        response = requests.post(
            self.caption_api_url,
            headers=self.headers,
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise _CaptionError(
                f"Error: {response.status_code} - {response.text}"
            )

        # The code reads the first element of the JSON payload as a dict
        return response.json()[0].get('generated_text', 'No caption generated')

    def generate_caption(self, image_path):
        """
        Generate a caption for the input image using Hugging Face API.

        This method never raises: failures are reported in the returned
        string (``"Error: ..."`` for HTTP errors, ``"An error occurred: ..."``
        for everything else).

        Args:
            image_path (str): Path to the input image

        Returns:
            str: Generated image caption, or an error description
        """
        try:
            return self._fetch_caption(image_path)
        except _CaptionError as e:
            return str(e)
        except Exception as e:
            # Boundary handler: callers rely on always getting a string back.
            return f"An error occurred: {str(e)}"

    def generate_variations(self, caption, num_variations=3):
        """
        Generate image variations based on the input caption.

        Each variation is requested independently; a failed request is
        reported on stdout and skipped, so images that were generated
        successfully are still returned.

        Args:
            caption (str): Base caption to generate images from
            num_variations (int): Number of image variations to generate
                (integral floats such as ``3.0`` are accepted)

        Returns:
            list: Generated image variations as PIL images (possibly empty)
        """
        try:
            # UI sliders may deliver the count as a float; range() needs int.
            count = int(num_variations)
        except (TypeError, ValueError) as e:
            print(f"An error occurred during image generation: {str(e)}")
            return []

        # Same for every request, so build once outside the loop.
        json_headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/json"
        }

        generated_images = []
        for i in range(count):
            # Create a slightly varied prompt
            varied_prompt = f"{caption}, artistic variation {i+1}, high quality"

            try:
                response = requests.post(
                    self.image_gen_api_url,
                    headers=json_headers,
                    json={"inputs": varied_prompt},
                    timeout=self.REQUEST_TIMEOUT,
                )

                if response.status_code != 200:
                    print(f"Error generating variation {i+1}: {response.status_code}")
                    continue

                # Convert response to PIL Image; load() forces decoding now so
                # a corrupt payload is caught here rather than at display time.
                image = Image.open(io.BytesIO(response.content))
                image.load()
            except Exception as e:
                # Boundary handler: one bad variation must not discard the
                # images that were already generated.
                print(f"An error occurred during image generation: {str(e)}")
                continue

            generated_images.append(image)

        return generated_images


def create_gradio_interface():
    """
    Create a Gradio interface for the Multimodal Image Creator.

    Returns:
        gr.Blocks: Gradio interface

    Raises:
        ValueError: If the API token is not configured (raised by
            ``MultimodalImageCreator``).
    """
    # Initialize the multimodal image creator
    creator = MultimodalImageCreator()

    def process_image(input_image, num_variations):
        """Caption the uploaded image and build its variations.

        Returns a 4-tuple matching the click handler's outputs: original
        image, caption (or error message), list of generated images, and
        the variation prompts as newline-separated text.
        """
        # Validate input
        if input_image is None:
            return None, "Please upload an image.", [], ""

        try:
            # A unique temp file per request avoids collisions between
            # concurrent users; PNG is lossless and accepts alpha channels.
            fd, temp_image_path = tempfile.mkstemp(suffix=".png")
            os.close(fd)
            try:
                Image.fromarray(input_image).save(temp_image_path)
                original_caption = creator._fetch_caption(temp_image_path)
            finally:
                # Clean up the temporary file even when captioning fails
                os.remove(temp_image_path)
        except _CaptionError as e:
            # Do not feed an API error message to the image generator.
            return input_image, str(e), [], ""
        except Exception as e:
            return None, f"An error occurred: {str(e)}", [], ""

        # Create variations (never raises; returns what could be generated)
        generated_images = creator.generate_variations(
            original_caption,
            num_variations=num_variations
        )

        # One prompt line per generated image, as plain text for the Textbox
        variation_captions = "\n".join(
            f"Variation based on: {original_caption}"
            for _ in generated_images
        )

        return input_image, original_caption, generated_images, variation_captions

    # Create Gradio Interface
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Image Content Creator")
        gr.Markdown("Upload an image to generate a caption and create variations!")

        with gr.Row():
            # Input components
            with gr.Column():
                input_image = gr.Image(type="numpy", label="Upload Image")
                num_variations = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="Number of Variations"
                )
                submit_btn = gr.Button("Generate Variations")

            # Output components
            with gr.Column():
                # Original image and caption
                original_image_output = gr.Image(label="Original Image")
                original_caption = gr.Textbox(label="Generated Caption")

                # Variations gallery
                variations_gallery = gr.Gallery(label="Image Variations")
                variations_captions = gr.Textbox(label="Variation Prompts")

        # Set up the processing
        submit_btn.click(
            fn=process_image,
            inputs=[input_image, num_variations],
            outputs=[
                original_image_output,
                original_caption,
                variations_gallery,
                variations_captions
            ]
        )

    return demo


# Create the Gradio interface at import time so hosting platforms that look
# for a module-level ``demo`` object can find it.
demo = create_gradio_interface()

# If running locally
if __name__ == "__main__":
    demo.launch(
        share=True,
        debug=True
    )