gowshiselva committed on
Commit
fdc7fa3
·
verified ·
1 Parent(s): 2e6a234

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +332 -121
app.py CHANGED
@@ -1,143 +1,303 @@
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
- from transformers import BlipProcessor, BlipForConditionalGeneration
5
- import re
 
6
 
7
- # Initial setup
8
- print("Loading models...")
 
9
 
10
- # Use a single, more reliable model for comprehensive descriptions
11
- blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
12
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- # Move model to GPU if available
15
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
- blip_model.to(device)
 
17
 
18
- print(f"Model loaded. Using device: {device}")
 
19
 
20
- def generate_caption(image, prompt):
21
- """Generate a caption based on image and text prompt"""
22
- inputs = blip_processor(image, prompt, return_tensors="pt").to(device)
23
- outputs = blip_model.generate(**inputs, max_new_tokens=100)
24
- return blip_processor.decode(outputs[0], skip_special_tokens=True)
25
 
26
- def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
27
- """
28
- Generate an advanced description using multiple targeted prompts
29
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  if image is None:
31
  return "Please upload an image to generate a description."
32
 
33
  try:
34
- with torch.no_grad():
35
- # Generate multiple aspects of the description using different targeted prompts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
 
 
 
 
37
  # 1. Basic caption
38
- basic_caption = generate_caption(image, "a detailed caption of this image:")
39
 
40
- # 2. Subject description
41
- subject_prompt = "Describe the main subjects in this image with details about their appearance:"
42
- subject_desc = generate_caption(image, subject_prompt)
 
 
 
 
 
43
 
44
  # 3. Setting/background
45
- setting_prompt = "Describe the setting, location, and background of this image:"
46
- setting_desc = generate_caption(image, setting_prompt)
 
 
47
 
48
  # 4. Colors and visual elements
49
  if style_focus >= 3:
50
  color_prompt = "Describe the color scheme, visual composition, and artistic style of this image:"
51
- color_desc = generate_caption(image, color_prompt)
52
- else:
53
- color_desc = ""
54
 
55
  # 5. Emotion and mood
56
  if emotion_focus >= 3:
57
  emotion_prompt = "Describe the mood, emotional tone, and atmosphere conveyed in this image:"
58
- emotion_desc = generate_caption(image, emotion_prompt)
59
- else:
60
- emotion_desc = ""
61
 
62
  # 6. Lighting and time
63
- lighting_prompt = "Describe the lighting conditions and time of day in this image:"
64
- lighting_desc = generate_caption(image, lighting_prompt)
 
 
65
 
66
  # 7. Details and textures (only for high detail levels)
67
- if detail_level >= 4:
68
  detail_prompt = "Describe the fine details, textures, and small elements visible in this image:"
69
- detail_desc = generate_caption(image, detail_prompt)
70
- else:
71
- detail_desc = ""
72
 
73
- # Clean up responses (sometimes the model repeats the prompt)
74
- def clean_response(response, prompt):
75
- # Remove the prompt if it appears at the beginning
76
- if response.startswith(prompt):
77
- response = response[len(prompt):].strip()
78
- return response
79
-
80
- subject_desc = clean_response(subject_desc, subject_prompt)
81
- setting_desc = clean_response(setting_desc, setting_prompt)
82
- if style_focus >= 3:
83
- color_desc = clean_response(color_desc, color_prompt)
84
- if emotion_focus >= 3:
85
- emotion_desc = clean_response(emotion_desc, emotion_prompt)
86
- lighting_desc = clean_response(lighting_desc, lighting_prompt)
87
- if detail_level >= 4:
88
- detail_desc = clean_response(detail_desc, detail_prompt)
89
-
90
- # Format results for AI image generation
91
  formatted_result = ""
92
 
93
  # Add basic subject identification
94
  formatted_result += f"## Basic Caption:\n{basic_caption}\n\n"
95
 
96
- # Add comprehensive description section
97
- formatted_result += f"## Detailed Description for AI Image Recreation:\n\n"
98
-
99
- formatted_result += f"**Main Subject(s):** {subject_desc}\n\n"
100
- formatted_result += f"**Setting/Background:** {setting_desc}\n\n"
101
- formatted_result += f"**Lighting/Atmosphere:** {lighting_desc}\n\n"
102
-
103
- if style_focus >= 3:
104
- formatted_result += f"**Visual Style/Colors:** {color_desc}\n\n"
105
-
106
- if emotion_focus >= 3:
107
- formatted_result += f"**Mood/Emotional Tone:** {emotion_desc}\n\n"
108
-
109
- if detail_level >= 4:
110
- formatted_result += f"**Fine Details/Textures:** {detail_desc}\n\n"
111
-
112
- # Additional section for AI generation prompts
113
- descriptions = [basic_caption.strip(".")]
114
- if len(subject_desc) > 10:
115
- descriptions.append(subject_desc.split(".")[0])
116
- if len(setting_desc) > 10:
117
- descriptions.append(setting_desc.split(".")[0])
118
- if style_focus >= 3 and len(color_desc) > 10:
119
- descriptions.append(color_desc.split(".")[0])
120
-
121
- # Create specific prompt for AI image generation
122
- formatted_result += "## Suggested AI Image Generation Prompt:\n\n"
123
- ai_prompt = ", ".join(descriptions)
124
-
125
- # Add qualifiers based on settings
126
- qualifiers = []
127
- if detail_level >= 4:
128
- qualifiers.append("highly detailed")
129
- qualifiers.append("intricate")
130
- if emotion_focus >= 4:
131
- qualifiers.append("emotional")
132
- qualifiers.append("evocative")
133
- if style_focus >= 4:
134
- qualifiers.append("artistic composition")
135
- qualifiers.append("professional photography")
136
-
137
- if qualifiers:
138
- ai_prompt += ", " + ", ".join(qualifiers)
139
 
140
- formatted_result += ai_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  return formatted_result
143
 
@@ -145,41 +305,92 @@ def generate_advanced_description(image, detail_level, emotion_focus, style_focu
145
  return f"Error generating description: {str(e)}"
146
 
147
  # Create Gradio interface
148
- with gr.Blocks(title="Advanced Image Description Generator") as demo:
149
- gr.Markdown("# Advanced Image Description Generator for AI Recreation")
150
- gr.Markdown("Upload an image to generate detailed descriptions that help AI image generators recreate similar images.")
151
 
152
  with gr.Row():
153
  with gr.Column(scale=1):
154
  input_image = gr.Image(label="Upload Image", type="pil")
155
- with gr.Row():
156
- detail_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Detail Level")
157
- emotion_slider = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Emotion Focus")
158
- style_slider = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Style/Artistic Focus")
159
- submit_btn = gr.Button("Generate Description")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
  with gr.Column(scale=1):
162
- output_text = gr.Textbox(label="Image Description", lines=20)
163
 
164
- submit_btn.click(
 
165
  fn=generate_advanced_description,
166
- inputs=[input_image, detail_slider, emotion_slider, style_slider],
 
 
 
 
 
 
 
 
 
167
  outputs=output_text
168
  )
169
 
170
  gr.Markdown("""
171
  ## How to Use
172
  1. Upload an image
173
- 2. Adjust the sliders to control description detail:
174
- - Detail Level: How comprehensive the description should be
175
- - Emotion Focus: Emphasis on mood and feelings
176
- - Style Focus: Emphasis on artistic elements
177
- 3. Click "Generate Description"
178
- 4. Use the generated text to prompt AI image generators
 
 
179
 
180
  ## About
181
- This app analyzes images and generates detailed descriptions suitable for recreating
182
- similar images with AI image generators like Stable Diffusion, Midjourney, or DALL-E.
 
183
  """)
184
 
185
  # Launch the app
 
import gradio as gr
import torch
from PIL import Image
import torch.nn as nn
import torchvision.transforms.functional as TVF
from transformers import (
    AutoModel,
    AutoProcessor,
    AutoTokenizer,
    BlipForConditionalGeneration,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
)

# FIX: `transformers` does not export `AutoModelForConditionalGeneration`;
# importing that name raises ImportError before the app can start. The BLIP
# captioning checkpoint used below loads via `BlipForConditionalGeneration`.
# The alias keeps any remaining references to the old name working.
AutoModelForConditionalGeneration = BlipForConditionalGeneration

# Define constants
TITLE = "<h1><center>Enhanced Image Captioning Studio</center></h1>"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-defined caption types with templates. Each entry holds three variants:
# [0] no length constraint, [1] a numeric {word_count} limit, [2] a verbal
# {length} descriptor ("short", "very long", ...). The index is chosen in
# generate_advanced_description() from the user's length selection.
CAPTION_TYPE_MAP = {
    "Descriptive": [
        "Write a descriptive caption for this image in a formal tone.",
        "Write a descriptive caption for this image in a formal tone within {word_count} words.",
        "Write a {length} descriptive caption for this image in a formal tone.",
    ],
    "Descriptive (Informal)": [
        "Write a descriptive caption for this image in a casual tone.",
        "Write a descriptive caption for this image in a casual tone within {word_count} words.",
        "Write a {length} descriptive caption for this image in a casual tone.",
    ],
    "AI Generation Prompt": [
        "Write a detailed prompt for AI image generation based on this image.",
        "Write a detailed prompt for AI image generation based on this image within {word_count} words.",
        "Write a {length} prompt for AI image generation based on this image.",
    ],
    "MidJourney": [
        "Write a MidJourney prompt for this image.",
        "Write a MidJourney prompt for this image within {word_count} words.",
        "Write a {length} MidJourney prompt for this image.",
    ],
    "Stable Diffusion": [
        "Write a Stable Diffusion prompt for this image.",
        "Write a Stable Diffusion prompt for this image within {word_count} words.",
        "Write a {length} Stable Diffusion prompt for this image.",
    ],
    "Art Critic": [
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.",
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.",
    ],
    "Product Listing": [
        "Write a caption for this image as though it were a product listing.",
        "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.",
        "Write a {length} caption for this image as though it were a product listing.",
    ],
    "Social Media Post": [
        "Write a caption for this image as if it were being used for a social media post.",
        "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.",
        "Write a {length} caption for this image as if it were being used for a social media post.",
    ],
    "Tag List": [
        "Write a list of tags for this image.",
        "Write a list of tags for this image within {word_count} words.",
        "Write a {length} list of tags for this image.",
    ],
    "Technical Analysis": [
        "Provide a technical analysis of this image including camera details, lighting, composition, and quality.",
        "Provide a technical analysis of this image including camera details, lighting, composition, and quality within {word_count} words.",
        "Provide a {length} technical analysis of this image including camera details, lighting, composition, and quality.",
    ],
}
class ImageAdapter(nn.Module):
    """Projects vision-tower hidden states into the language-model embedding space.

    The projected image tokens are bracketed with learned <|image_start|> /
    <|image_end|> embeddings; a learned <|eot_id|> embedding is exposed via
    :meth:`get_eot_embedding`.
    """

    def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool):
        super().__init__()
        self.deep_extract = deep_extract

        # Deep extraction concatenates five hidden layers along the feature
        # dimension, so the projector input is five times wider.
        if self.deep_extract:
            input_features = input_features * 5

        self.linear1 = nn.Linear(input_features, output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(output_features, output_features)
        self.ln1 = nn.LayerNorm(input_features) if ln1 else nn.Identity()
        self.pos_emb = nn.Parameter(torch.zeros(num_image_tokens, input_features)) if pos_emb else None

        # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>)
        self.other_tokens = nn.Embedding(3, output_features)
        self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, vision_outputs: torch.Tensor):
        """Map stacked vision hidden states to (batch, num_tokens + 2, output_features)."""
        if not self.deep_extract:
            hidden = vision_outputs[-2]
        else:
            # Fixed selection of layers; indices follow the upstream recipe.
            hidden = torch.concat(
                (
                    vision_outputs[-2],
                    vision_outputs[3],
                    vision_outputs[7],
                    vision_outputs[13],
                    vision_outputs[20],
                ),
                dim=-1,
            )
            assert len(hidden.shape) == 3, f"Expected 3, got {len(hidden.shape)}"
            assert hidden.shape[-1] == vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {hidden.shape[-1]}"

        hidden = self.ln1(hidden)

        if self.pos_emb is not None:
            assert hidden.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {hidden.shape[-2:]}"
            hidden = hidden + self.pos_emb

        hidden = self.linear2(self.activation(self.linear1(hidden)))

        # <|image_start|>, IMAGE, <|image_end|> — embed ids 0 and 1 for the
        # whole batch and splice them around the projected image tokens.
        bracket = self.other_tokens(
            torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(hidden.shape[0], -1)
        )
        assert bracket.shape == (hidden.shape[0], 2, hidden.shape[2]), f"Expected {(hidden.shape[0], 2, hidden.shape[2])}, got {bracket.shape}"
        return torch.cat((bracket[:, 0:1], hidden, bracket[:, 1:2]), dim=1)

    def get_eot_embedding(self):
        """Return the learned <|eot_id|> embedding (token id 2)."""
        return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0)
# Model loading functions
def load_siglip_model():
    """Load the SigLIP vision tower (frozen, eval mode, on DEVICE) and its processor."""
    print("Loading SigLIP model...")
    checkpoint = "google/siglip-so400m-patch14-384"
    processor = AutoProcessor.from_pretrained(checkpoint)
    # Only the vision tower is needed; keep it frozen for inference.
    vision_tower = AutoModel.from_pretrained(checkpoint).vision_model
    vision_tower.eval()
    vision_tower.requires_grad_(False)
    vision_tower.to(DEVICE)
    return vision_tower, processor
def load_blip_model():
    """Load the BLIP captioning model and its processor onto DEVICE in eval mode.

    Returns:
        tuple: (model, processor) for "Salesforce/blip-image-captioning-large".
    """
    # FIX: the original code called `AutoModelForConditionalGeneration`, a
    # class that does not exist in transformers (the import itself raises
    # ImportError). BLIP captioning checkpoints load via
    # `BlipForConditionalGeneration`; imported locally so this function is
    # self-contained.
    from transformers import BlipForConditionalGeneration

    print("Loading BLIP model...")
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
    model.to(DEVICE)
    model.eval()
    return model, processor
# Initialize models (with optional lazy loading)
class ModelManager:
    """Lazy holder for the captioning models.

    Every model/processor slot stays ``None`` until :meth:`load_models` is
    called, so importing this module never triggers checkpoint downloads.
    """

    def __init__(self):
        self.blip_model = None
        self.blip_processor = None
        self.siglip_model = None
        self.siglip_processor = None
        self.image_adapter = None
        self.llm_model = None
        self.tokenizer = None
        self.models_loaded = False

    def load_models(self):
        """Load the models on first call; later calls are no-ops. Returns self."""
        if self.models_loaded:
            return self

        # Load BLIP model for basic captioning.
        self.blip_model, self.blip_processor = load_blip_model()

        # For more advanced captioning, set up paths to load custom models.
        # In a real implementation, you would load the full pipeline with
        # proper paths; for now BLIP serves both simple and advanced modes.
        self.models_loaded = True
        return self


model_manager = ModelManager()
def generate_basic_caption(image, prompt="a detailed caption of this image:"):
    """Generate a basic caption using BLIP model"""
    # Ensure the BLIP weights are available (no-op after the first call).
    model_manager.load_models()

    encoded = model_manager.blip_processor(image, prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        token_ids = model_manager.blip_model.generate(**encoded, max_new_tokens=100)

    return model_manager.blip_processor.decode(token_ids[0], skip_special_tokens=True)
def generate_advanced_description(image, caption_type, caption_length, detail_level, emotion_focus, style_focus, extra_options, custom_prompt):
    """Generate an advanced description using multiple targeted prompts"""
    if image is None:
        return "Please upload an image to generate a description."

    try:
        # Load models if not already loaded
        model_manager.load_models()

        # Normalize the length selection: "any" means unconstrained, a
        # numeric string becomes an int, anything else stays a verbal length.
        length = None if caption_length == "any" else caption_length
        if isinstance(length, str):
            try:
                length = int(length)
            except ValueError:
                pass

        # Template index: 0 = no constraint, 1 = {word_count}, 2 = {length}.
        if length is None:
            map_idx = 0
        elif isinstance(length, int):
            map_idx = 1
        else:
            map_idx = 2

        prompt_str = CAPTION_TYPE_MAP.get(caption_type, CAPTION_TYPE_MAP["Descriptive"])[map_idx]

        # Append any extra checkbox instructions.
        if extra_options:
            prompt_str = prompt_str + " " + " ".join(extra_options)

        # Fill in whichever placeholder the chosen template uses.
        prompt_str = prompt_str.format(length=caption_length, word_count=caption_length)

        # A non-empty custom prompt overrides everything built above.
        if custom_prompt and custom_prompt.strip():
            prompt_str = custom_prompt.strip()

        print(f"Using prompt: {prompt_str}")

        # Generate the base caption plus extra aspects gated by the sliders.
        # Order and gating thresholds match the UI semantics.
        with torch.no_grad():
            # 1. Basic caption
            basic_caption = generate_basic_caption(image, prompt_str)
            descriptions = [("Basic Caption", basic_caption)]

            aspect_specs = [
                (detail_level >= 2, "Main Subject(s)",
                 "Describe the main subjects in this image with details about their appearance:"),
                (detail_level >= 3, "Setting/Background",
                 "Describe the setting, location, and background of this image:"),
                (style_focus >= 3, "Visual Style/Colors",
                 "Describe the color scheme, visual composition, and artistic style of this image:"),
                (emotion_focus >= 3, "Mood/Emotional Tone",
                 "Describe the mood, emotional tone, and atmosphere conveyed in this image:"),
                (detail_level >= 4 or style_focus >= 4, "Lighting/Atmosphere",
                 "Describe the lighting conditions and time of day in this image:"),
                (detail_level >= 5, "Fine Details/Textures",
                 "Describe the fine details, textures, and small elements visible in this image:"),
            ]
            for enabled, title, aspect_prompt in aspect_specs:
                if enabled:
                    descriptions.append((title, generate_basic_caption(image, aspect_prompt)))

        # Assemble the markdown result.
        formatted_result = ""
        formatted_result += f"## Basic Caption:\n{basic_caption}\n\n"

        # Add comprehensive description section if more detailed
        if detail_level >= 2:
            formatted_result += f"## Detailed Description:\n\n"
            for title, desc in descriptions[1:]:  # Skip the basic caption
                formatted_result += f"**{title}:** {desc}\n\n"

        # Additional section for AI generation prompts if requested
        if caption_type in ["AI Generation Prompt", "MidJourney", "Stable Diffusion"]:
            # Condense each aspect to its first sentence.
            ai_descriptions = [basic_caption.strip(".")]
            for _, desc in descriptions[1:]:
                if len(desc) > 10:
                    ai_descriptions.append(desc.split(".")[0])

            formatted_result += "## Suggested AI Image Generation Prompt:\n\n"
            ai_prompt = ", ".join(ai_descriptions)

            # Add qualifiers based on settings
            qualifiers = []
            if detail_level >= 4:
                qualifiers.extend(["highly detailed", "intricate"])
            if emotion_focus >= 4:
                qualifiers.extend(["emotional", "evocative"])
            if style_focus >= 4:
                qualifiers.extend(["artistic composition", "professional photography"])

            if qualifiers:
                ai_prompt += ", " + ", ".join(qualifiers)

            formatted_result += ai_prompt

        return formatted_result

    except Exception as e:
        return f"Error generating description: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Enhanced Image Captioning Studio") as demo:
    gr.HTML(TITLE)
    gr.Markdown("Upload an image to generate detailed captions and descriptions tailored to your needs.")

    with gr.Row():
        # Left column: image upload plus all generation controls.
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="pil")

            caption_type = gr.Dropdown(
                label="Caption Type",
                value="Descriptive",
                choices=list(CAPTION_TYPE_MAP.keys()),
            )

            # Verbal lengths plus word counts 20..300 in steps of 20.
            caption_length = gr.Dropdown(
                label="Caption Length",
                value="medium-length",
                choices=["any", "very short", "short", "medium-length", "long", "very long"]
                + [str(i) for i in range(20, 301, 20)],
            )

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    detail_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Detail Level")
                    emotion_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Emotion Focus")
                    style_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Style/Artistic Focus")

                extra_options = gr.CheckboxGroup(
                    label="Additional Options",
                    choices=[
                        "Include information about lighting.",
                        "Include information about camera angle.",
                        "Include information about whether there is a watermark or not.",
                        "Include information about any artifacts or quality issues.",
                        "If it is a photo, include likely camera details such as aperture, shutter speed, ISO, etc.",
                        "Do NOT include anything sexual; keep it PG.",
                        "Do NOT mention the image's resolution.",
                        "Include information about the subjective aesthetic quality of the image.",
                        "Include information on the image's composition style.",
                        "Do NOT mention any text that is in the image.",
                        "Specify the depth of field and focus.",
                        "Mention the likely use of artificial or natural lighting sources.",
                        "ONLY describe the most important elements of the image.",
                    ],
                )

                custom_prompt = gr.Textbox(label="Custom Prompt (optional, will override other settings)")
                gr.Markdown("**Note:** Custom prompts may not work with all models and settings.")

            generate_btn = gr.Button("Generate Description", variant="primary")

        # Right column: the generated text.
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Generated Description", lines=25)

    # Set up event handlers
    generate_btn.click(
        fn=generate_advanced_description,
        inputs=[
            input_image,
            caption_type,
            caption_length,
            detail_slider,
            emotion_slider,
            style_slider,
            extra_options,
            custom_prompt,
        ],
        outputs=output_text,
    )

    gr.Markdown("""
    ## How to Use
    1. Upload an image
    2. Select the type of caption you want
    3. Choose a length preference
    4. Adjust advanced settings if needed:
    - Detail Level: Controls the comprehensiveness of the description
    - Emotion Focus: Emphasizes mood and feelings in the output
    - Style Focus: Emphasizes artistic elements in the output
    5. Select any additional options you'd like included
    6. Click "Generate Description"

    ## About
    This application combines multiple image analysis techniques to generate rich,
    detailed descriptions of images. It's especially useful for creating prompts
    for AI image generators like Stable Diffusion, Midjourney, or DALL-E.
    """)
395
 
396
  # Launch the app