File size: 6,865 Bytes
ffc2acd 71e69ad ffc2acd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
import os
import requests
import gradio as gr
from PIL import Image
import io
class MultimodalImageCreator:
    """Caption an image and generate new images from that caption.

    Both steps are plain HTTP calls to hosted Hugging Face inference
    endpoints: a BLIP captioning model and a Stable Diffusion 2
    text-to-image model. Authentication uses the ``HF_API_TOKEN``
    environment variable.
    """

    # Seconds to wait for an inference endpoint before giving up. Without a
    # timeout, ``requests`` waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 120

    def __init__(self):
        """
        Initialize the Multimodal Image Creator
        Uses environment variables for API token

        Raises:
            ValueError: If ``HF_API_TOKEN`` is unset or empty.
        """
        # Retrieve API token from environment variable
        self.hf_token = os.environ.get('HF_API_TOKEN')
        if not self.hf_token:
            raise ValueError(
                "Hugging Face API token not found. "
                "Set it in Spaces secrets or as an environment variable."
            )
        # Image Captioning API Endpoint
        self.caption_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
        # Text-to-Image API Endpoint
        self.image_gen_api_url = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"
        # Headers for requests whose body is raw image bytes (captioning)
        self.headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/octet-stream"
        }

    def generate_caption(self, image_path):
        """
        Generate a caption for the input image using Hugging Face API

        Failures are reported in the returned string rather than raised:
        a non-200 response yields ``"Error: <status> - <body>"`` and any
        exception yields ``"An error occurred: <message>"``.

        Args:
            image_path (str): Path to the input image
        Returns:
            str: Generated image caption, or an error description
        """
        try:
            # Read the image file
            with open(image_path, "rb") as f:
                data = f.read()
            # Make API request
            response = requests.post(
                self.caption_api_url,
                headers=self.headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                return f"Error: {response.status_code} - {response.text}"
            # The code expects a JSON list whose first item holds the caption
            return response.json()[0].get('generated_text', 'No caption generated')
        except Exception as e:
            # Best-effort boundary: callers expect a string, never an exception
            return f"An error occurred: {str(e)}"

    def generate_variations(self, caption, num_variations=3):
        """
        Generate image variations based on the input caption

        Each variation is requested independently; a variation that fails
        (bad status, network error, undecodable payload) is logged to stdout
        and skipped, so the images that did succeed are still returned.

        Args:
            caption (str): Base caption to generate images from
            num_variations (int): Number of image variations to generate
        Returns:
            list: Generated image variations (possibly fewer than requested)
        """
        try:
            # UI widgets may hand over whole-number floats; range() needs an int
            count = int(num_variations)
        except (TypeError, ValueError) as e:
            print(f"An error occurred during image generation: {str(e)}")
            return []

        # Same headers for every request, so build them once outside the loop
        request_headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/json"
        }

        generated_images = []
        for i in range(count):
            # Create a slightly varied prompt
            varied_prompt = f"{caption}, artistic variation {i+1}, high quality"
            try:
                response = requests.post(
                    self.image_gen_api_url,
                    headers=request_headers,
                    json={"inputs": varied_prompt},
                    timeout=self.REQUEST_TIMEOUT
                )
                if response.status_code != 200:
                    print(f"Error generating variation {i+1}: {response.status_code}")
                    continue
                # Convert response to PIL Image
                image = Image.open(io.BytesIO(response.content))
                # Image.open is lazy; decode now so a corrupt payload is
                # caught here instead of later in the caller
                image.load()
                generated_images.append(image)
            except Exception as e:
                # Best-effort: one failed variation must not discard the rest
                print(f"An error occurred during image generation: {str(e)}")
        return generated_images
def create_gradio_interface():
    """
    Create a Gradio interface for the Multimodal Image Creator

    The interface takes an uploaded image and a variation count, captions
    the image, generates images from that caption, and shows the original
    image, the caption, a gallery of generated images and one line of text
    per generated image.

    Returns:
        gr.Blocks: Gradio interface
    Raises:
        ValueError: Propagated from MultimodalImageCreator() when the API
            token is not configured.
    """
    # Local import keeps this function self-contained
    import tempfile

    # Initialize the multimodal image creator
    creator = MultimodalImageCreator()

    # generate_caption reports failures as strings with these prefixes
    # instead of raising; such a string must not be used as a prompt.
    caption_error_prefixes = ("Error: ", "An error occurred: ")

    def process_image(input_image, num_variations):
        """Run caption + variation generation for one uploaded image.

        Returns a 4-tuple matching the click handler's outputs:
        (original image, caption or error text, list of images, prompt text).
        """
        # Validate input
        if input_image is None:
            return None, "Please upload an image.", [], ""

        temp_image_path = None
        try:
            # Unique per-request file: a fixed name would be overwritten by
            # concurrent requests.
            fd, temp_image_path = tempfile.mkstemp(suffix=".jpg")
            os.close(fd)
            # JPEG cannot store an alpha channel, so normalise to RGB first
            Image.fromarray(input_image).convert("RGB").save(temp_image_path)

            # Generate caption
            original_caption = creator.generate_caption(temp_image_path)
            if original_caption.startswith(caption_error_prefixes):
                # Captioning failed: show the error, skip image generation
                return input_image, original_caption, [], ""

            # Create variations (slider values may arrive as floats)
            generated_images = creator.generate_variations(
                original_caption,
                num_variations=int(num_variations)
            )

            # One line per generated image; a Textbox displays a string
            variation_captions = "\n".join(
                f"Variation based on: {original_caption}"
                for _ in generated_images
            )
            return input_image, original_caption, generated_images, variation_captions
        except Exception as e:
            # UI boundary: surface the problem in the caption box
            return None, f"An error occurred: {str(e)}", [], ""
        finally:
            # Clean up the temporary file even when a step above failed
            if temp_image_path is not None and os.path.exists(temp_image_path):
                os.remove(temp_image_path)

    # Create Gradio Interface
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Image Content Creator")
        gr.Markdown("Upload an image to generate a caption and create variations!")
        with gr.Row():
            # Input components
            with gr.Column():
                input_image = gr.Image(type="numpy", label="Upload Image")
                num_variations = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="Number of Variations"
                )
                submit_btn = gr.Button("Generate Variations")
            # Output components
            with gr.Column():
                # Original image and caption
                original_image_output = gr.Image(label="Original Image")
                original_caption = gr.Textbox(label="Generated Caption")
                # Variations gallery
                variations_gallery = gr.Gallery(label="Image Variations")
                variations_captions = gr.Textbox(label="Variation Prompts")
        # Set up the processing
        submit_btn.click(
            fn=process_image,
            inputs=[input_image, num_variations],
            outputs=[
                original_image_output,
                original_caption,
                variations_gallery,
                variations_captions
            ]
        )
    return demo
# Build the interface at import time so the module exposes `demo`
demo = create_gradio_interface()

# Start the server only when this file is executed as a script
if __name__ == "__main__":
    demo.launch(debug=True, share=True)
|