|
|
import io
import os
import tempfile

import gradio as gr
import requests
from PIL import Image
|
|
|
|
|
class MultimodalImageCreator:
    """Caption an input image and generate prompt-based image variations.

    Both steps go through the hosted Hugging Face Inference API; the API
    token is read from the HF_API_TOKEN environment variable.
    """

    # Seconds to wait for an Inference API response. Hosted models can be
    # slow to cold-start, so keep this generous — but without any timeout
    # a stalled request would hang the app forever.
    REQUEST_TIMEOUT = 120

    def __init__(self):
        """
        Initialize the Multimodal Image Creator

        Uses environment variables for API token

        Raises:
            ValueError: If HF_API_TOKEN is not set.
        """
        self.hf_token = os.environ.get('HF_API_TOKEN')

        if not self.hf_token:
            raise ValueError(
                "Hugging Face API token not found. "
                "Set it in Spaces secrets or as an environment variable."
            )

        # Image -> text model used for captioning.
        self.caption_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

        # Text -> image model used for generating variations.
        self.image_gen_api_url = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"

        # Default headers: captioning POSTs raw image bytes, hence the
        # octet-stream content type. generate_variations overrides these
        # with a JSON content type for its text prompt.
        self.headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/octet-stream"
        }

    def generate_caption(self, image_path):
        """
        Generate a caption for the input image using Hugging Face API

        Args:
            image_path (str): Path to the input image

        Returns:
            str: Generated image caption, or a human-readable error string
                (this method never raises; the UI displays the string).
        """
        try:
            with open(image_path, "rb") as f:
                data = f.read()

            response = requests.post(
                self.caption_api_url,
                headers=self.headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT
            )

            if response.status_code == 200:
                payload = response.json()
                # Expected shape: [{"generated_text": "..."}]. Guard against
                # empty or non-list payloads instead of letting IndexError
                # collapse into the generic error message below.
                if isinstance(payload, list) and payload:
                    return payload[0].get('generated_text', 'No caption generated')
                return 'No caption generated'
            else:
                return f"Error: {response.status_code} - {response.text}"

        except Exception as e:
            return f"An error occurred: {str(e)}"

    def generate_variations(self, caption, num_variations=3):
        """
        Generate image variations based on the input caption

        Args:
            caption (str): Base caption to generate images from
            num_variations (int): Number of image variations to generate

        Returns:
            list: Generated PIL.Image variations. May be shorter than
                num_variations if individual requests fail; on an
                unexpected exception the images generated so far are
                returned rather than discarded.
        """
        generated_images = []

        try:
            for i in range(num_variations):
                # Nudge the prompt per iteration so the variations differ.
                varied_prompt = f"{caption}, artistic variation {i+1}, high quality"

                response = requests.post(
                    self.image_gen_api_url,
                    headers={
                        "Authorization": f"Bearer {self.hf_token}",
                        "Content-Type": "application/json"
                    },
                    json={"inputs": varied_prompt},
                    timeout=self.REQUEST_TIMEOUT
                )

                if response.status_code == 200:
                    # The API returns the generated image as raw bytes.
                    image = Image.open(io.BytesIO(response.content))
                    generated_images.append(image)
                else:
                    print(f"Error generating variation {i+1}: {response.status_code}")

            return generated_images

        except Exception as e:
            print(f"An error occurred during image generation: {str(e)}")
            # Keep whatever was successfully generated before the failure.
            return generated_images
|
|
|
|
|
def create_gradio_interface():
    """
    Create a Gradio interface for the Multimodal Image Creator

    Returns:
        gr.Blocks: Gradio interface (configured but not launched)
    """
    creator = MultimodalImageCreator()

    def process_image(input_image, num_variations):
        """Caption the uploaded image, then generate variations of it.

        Returns a 4-tuple matching the click() outputs: (original image,
        caption text, gallery images, variation prompt text).
        """
        try:
            if input_image is None:
                return None, "Please upload an image.", [], ""

            # Persist the in-memory numpy image to a unique temp file,
            # since generate_caption reads from a path. A fixed filename
            # would race when two requests run concurrently.
            fd, temp_image_path = tempfile.mkstemp(suffix=".jpg")
            os.close(fd)
            try:
                Image.fromarray(input_image).save(temp_image_path)
                original_caption = creator.generate_caption(temp_image_path)
            finally:
                # Always clean up, even if saving/captioning fails.
                os.remove(temp_image_path)

            generated_images = creator.generate_variations(
                original_caption,
                # Slider values may arrive as float; range() needs an int.
                num_variations=int(num_variations)
            )

            # gr.Textbox expects a string, not a list — one line per image.
            variation_captions = "\n".join(
                f"Variation based on: {original_caption}"
                for _ in generated_images
            )

            return input_image, original_caption, generated_images, variation_captions

        except Exception as e:
            return None, f"An error occurred: {str(e)}", [], ""

    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Image Content Creator")
        gr.Markdown("Upload an image to generate a caption and create variations!")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="numpy", label="Upload Image")
                num_variations = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="Number of Variations"
                )
                submit_btn = gr.Button("Generate Variations")

            with gr.Column():
                original_image_output = gr.Image(label="Original Image")
                original_caption = gr.Textbox(label="Generated Caption")

        variations_gallery = gr.Gallery(label="Image Variations")
        variations_captions = gr.Textbox(label="Variation Prompts")

        submit_btn.click(
            fn=process_image,
            inputs=[input_image, num_variations],
            outputs=[
                original_image_output,
                original_caption,
                variations_gallery,
                variations_captions
            ]
        )

    return demo
|
|
|
|
|
|
|
|
# Build the interface at import time so hosting platforms (e.g. Hugging
# Face Spaces) can discover the module-level `demo` object.
# NOTE(review): this runs MultimodalImageCreator's token check on import,
# so importing this module without HF_API_TOKEN set raises ValueError.
demo = create_gradio_interface()
|
|
|
|
|
|
|
|
# Launch the app only when run directly as a script.
if __name__ == "__main__":
    demo.launch(
        # share=True requests a public tunnel link for local runs;
        # NOTE(review): it has no effect on Spaces — confirm it is wanted.
        share=True,
        # debug=True keeps the process attached and surfaces errors
        # in the console.
        debug=True
    )
|
|
|
|
|
|