File size: 6,865 Bytes
ffc2acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71e69ad
ffc2acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import io
import os
import tempfile

import gradio as gr
import requests
from PIL import Image

class MultimodalImageCreator:
    """Caption an image and generate new images from that caption.

    Both steps are HTTP POST requests to Hugging Face Inference API
    endpoints: a BLIP captioning model and a Stable Diffusion
    text-to-image model. The API token is read from the ``HF_API_TOKEN``
    environment variable.
    """

    # Seconds passed as ``timeout`` to every ``requests.post`` call.
    # ``requests`` applies no timeout unless one is given, so without this a
    # stalled endpoint would block the calling handler indefinitely.
    REQUEST_TIMEOUT = 120

    def __init__(self):
        """
        Initialize the Multimodal Image Creator
        Uses environment variables for API token

        Raises:
            ValueError: If ``HF_API_TOKEN`` is unset or empty.
        """
        # Retrieve API token from environment variable
        self.hf_token = os.environ.get('HF_API_TOKEN')

        if not self.hf_token:
            raise ValueError(
                "Hugging Face API token not found. "
                "Set it in Spaces secrets or as an environment variable."
            )

        # Image Captioning API Endpoint
        self.caption_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

        # Text-to-Image API Endpoint
        self.image_gen_api_url = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"

        # Headers for the captioning request, whose body is raw image bytes
        self.headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/octet-stream"
        }

    def generate_caption(self, image_path):
        """
        Generate a caption for the input image using Hugging Face API

        Failures are reported through the return value rather than raised:
        a non-200 response yields ``"Error: <status> - <body>"`` and any
        exception yields ``"An error occurred: <message>"``.

        Args:
            image_path (str): Path to the input image

        Returns:
            str: Generated image caption, or an error message as described
            above
        """
        try:
            # Read the image file
            with open(image_path, "rb") as f:
                data = f.read()

            # Make API request
            response = requests.post(
                self.caption_api_url,
                headers=self.headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                return f"Error: {response.status_code} - {response.text}"

            # The caption is read from the first element of a JSON list of
            # dicts. Check the shape explicitly so a malformed body produces
            # a readable message instead of "list index out of range".
            payload = response.json()
            if not (isinstance(payload, list) and payload
                    and isinstance(payload[0], dict)):
                return "An error occurred: unexpected response format"

            return payload[0].get('generated_text', 'No caption generated')

        except Exception as e:
            # Boundary handler: callers expect a string, never an exception.
            return f"An error occurred: {str(e)}"

    def generate_variations(self, caption, num_variations=3):
        """
        Generate image variations based on the input caption

        Each variation is requested independently, so a failed request is
        logged with ``print`` and skipped while images that were already
        generated are still returned.

        Args:
            caption (str): Base caption to generate images from
            num_variations (int): Number of image variations to generate.
                Integral floats (for example from a UI slider) are accepted.

        Returns:
            list: Generated image variations as PIL images; may contain
            fewer than ``num_variations`` items, or be empty
        """
        # range() rejects floats, so normalise the count up front.
        try:
            count = int(num_variations)
        except (TypeError, ValueError) as e:
            print(f"An error occurred during image generation: {str(e)}")
            return []

        # The text-to-image request sends a JSON body, unlike captioning.
        request_headers = {
            "Authorization": f"Bearer {self.hf_token}",
            "Content-Type": "application/json"
        }

        generated_images = []

        for i in range(count):
            # Create a slightly varied prompt
            varied_prompt = f"{caption}, artistic variation {i+1}, high quality"

            try:
                # Make API request
                response = requests.post(
                    self.image_gen_api_url,
                    headers=request_headers,
                    json={"inputs": varied_prompt},
                    timeout=self.REQUEST_TIMEOUT,
                )

                # Check response
                if response.status_code == 200:
                    # Convert response to PIL Image
                    image = Image.open(io.BytesIO(response.content))
                    generated_images.append(image)
                else:
                    print(f"Error generating variation {i+1}: {response.status_code}")

            except Exception as e:
                # Keep going: one bad request should not discard the rest.
                print(f"An error occurred during image generation: {str(e)}")

        return generated_images

def create_gradio_interface():
    """
    Create a Gradio interface for the Multimodal Image Creator

    The interface takes an uploaded image and a variation count, captions
    the image, generates images from that caption, and shows the original
    image, the caption, a gallery of generated images and one text line per
    generated image.

    Returns:
        gr.Blocks: Gradio interface

    Raises:
        ValueError: Propagated from ``MultimodalImageCreator()`` when the
            API token is not configured.
    """
    # Initialize the multimodal image creator
    creator = MultimodalImageCreator()

    def process_image(input_image, num_variations):
        """Run captioning and variation generation for one upload.

        Returns a 4-tuple matching the click handler's outputs:
        (original image, caption or error text, list of images, text).
        """
        # Validate input
        if input_image is None:
            return None, "Please upload an image.", [], ""

        temp_image_path = None
        try:
            # Use a unique temporary file per request so concurrent requests
            # cannot overwrite each other's upload.
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
                temp_image_path = tmp.name
                # JPEG cannot store an alpha channel; convert first so
                # uploads with transparency do not fail on save.
                Image.fromarray(input_image).convert("RGB").save(tmp, format="JPEG")

            # Generate caption
            original_caption = creator.generate_caption(temp_image_path)

            # generate_caption reports failures as strings with these
            # prefixes. Show the message and stop, rather than using an
            # error message as an image-generation prompt.
            if original_caption.startswith(("Error:", "An error occurred:")):
                return input_image, original_caption, [], ""

            # Create variations (the slider value may arrive as a float)
            generated_images = creator.generate_variations(
                original_caption,
                num_variations=int(num_variations)
            )

            # One line per generated image; the output is a Textbox, so it
            # receives a single string rather than a list.
            variation_captions = "\n".join(
                f"Variation based on: {original_caption}"
                for _ in generated_images
            )

            return input_image, original_caption, generated_images, variation_captions

        except Exception as e:
            # UI boundary: surface the failure in the caption box.
            return None, f"An error occurred: {str(e)}", [], ""

        finally:
            # Clean up the temporary file on every exit path.
            if temp_image_path is not None and os.path.exists(temp_image_path):
                os.remove(temp_image_path)

    # Create Gradio Interface
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Image Content Creator")
        gr.Markdown("Upload an image to generate a caption and create variations!")

        with gr.Row():
            # Input components
            with gr.Column():
                input_image = gr.Image(type="numpy", label="Upload Image")
                num_variations = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="Number of Variations"
                )
                submit_btn = gr.Button("Generate Variations")

            # Output components
            with gr.Column():
                # Original image and caption
                original_image_output = gr.Image(label="Original Image")
                original_caption = gr.Textbox(label="Generated Caption")

                # Variations gallery
                variations_gallery = gr.Gallery(label="Image Variations")
                variations_captions = gr.Textbox(label="Variation Prompts")

        # Set up the processing
        submit_btn.click(
            fn=process_image,
            inputs=[input_image, num_variations],
            outputs=[
                original_image_output,
                original_caption,
                variations_gallery,
                variations_captions
            ]
        )

    return demo

# Build the interface when the module is loaded so that `demo` exists as a
# module-level name.
demo = create_gradio_interface()

# Start the server only when this file is executed directly as a script.
if __name__ == "__main__":
    launch_options = {"share": True, "debug": True}
    demo.launch(**launch_options)