Update app.py
Browse files
app.py
CHANGED
|
@@ -12,28 +12,147 @@ import torchvision.transforms.functional as TVF
|
|
| 12 |
|
| 13 |
|
| 14 |
CLIP_PATH = "google/siglip-so400m-patch14-384"
|
| 15 |
-
MODEL_PATH = "
|
| 16 |
CHECKPOINT_PATH = Path("9em124t2-499968")
|
| 17 |
-
TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
|
| 18 |
CAPTION_TYPE_MAP = {
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
}
|
| 38 |
|
| 39 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
@@ -138,105 +257,122 @@ text_model.eval()
|
|
| 138 |
# Image Adapter
|
| 139 |
print("Loading image adapter")
|
| 140 |
image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
|
| 141 |
-
image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"
|
| 142 |
image_adapter.eval()
|
| 143 |
image_adapter.to("cuda")
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
-
|
| 147 |
-
"""
|
| 148 |
-
Preprocess the input image for the CLIP model.
|
| 149 |
-
"""
|
| 150 |
image = input_image.resize((384, 384), Image.LANCZOS)
|
| 151 |
pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
|
| 152 |
pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
|
| 153 |
-
|
| 154 |
|
| 155 |
-
|
| 156 |
-
"""
|
| 157 |
-
Generate a caption based on the image features and prompt.
|
| 158 |
-
"""
|
| 159 |
prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
|
| 163 |
|
|
|
|
| 164 |
inputs_embeds = torch.cat([
|
| 165 |
-
embedded_bos.expand(
|
| 166 |
-
|
| 167 |
-
prompt_embeds.expand(
|
| 168 |
-
eot_embed.expand(
|
| 169 |
], dim=1)
|
| 170 |
|
| 171 |
input_ids = torch.cat([
|
| 172 |
-
torch.tensor([[
|
| 173 |
-
torch.zeros((1,
|
| 174 |
prompt,
|
| 175 |
torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
|
| 176 |
], dim=1).to('cuda')
|
| 177 |
attention_mask = torch.ones_like(input_ids)
|
| 178 |
|
| 179 |
-
generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=
|
| 180 |
|
|
|
|
| 181 |
generate_ids = generate_ids[:, input_ids.shape[1]:]
|
| 182 |
if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
|
| 183 |
generate_ids = generate_ids[:, :-1]
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
@spaces.GPU()
|
| 188 |
-
@torch.no_grad()
|
| 189 |
-
def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, lens_type: str = "", film_stock: str = "", composition_style: str = "", lighting_aspect: str = "", special_technique: str = "", color_effect: str = "") -> str:
|
| 190 |
-
"""
|
| 191 |
-
Generate a caption or style prompt based on the input image and parameters.
|
| 192 |
-
"""
|
| 193 |
-
torch.cuda.empty_cache()
|
| 194 |
-
|
| 195 |
-
try:
|
| 196 |
-
length = None if caption_length == "any" else caption_length
|
| 197 |
-
if isinstance(length, str):
|
| 198 |
-
length = int(length)
|
| 199 |
-
except ValueError:
|
| 200 |
-
raise ValueError(f"Invalid caption length: {caption_length}")
|
| 201 |
-
|
| 202 |
-
if caption_type in ["rng-tags", "training_prompt", "style_prompt"]:
|
| 203 |
-
caption_tone = "formal"
|
| 204 |
-
|
| 205 |
-
prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
|
| 206 |
-
if prompt_key not in CAPTION_TYPE_MAP:
|
| 207 |
-
raise ValueError(f"Invalid caption type: {prompt_key}")
|
| 208 |
-
|
| 209 |
-
prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
|
| 210 |
-
|
| 211 |
-
if caption_type == "style_prompt":
|
| 212 |
-
prompt_str += f" Lens type: {lens_type} ({lens_types_info[lens_type]}). "
|
| 213 |
-
prompt_str += f"Film stock: {film_stocks_info[film_stock]}). "
|
| 214 |
-
prompt_str += f"Composition style: {composition_styles_info[composition_style]}). "
|
| 215 |
-
prompt_str += f"Lighting aspect: {lighting_aspects_info[lighting_aspect]}). "
|
| 216 |
-
prompt_str += f"Special technique: {special_techniques_info[special_technique]}). "
|
| 217 |
-
prompt_str += f"Color effect: {color_effects_info[color_effect]})."
|
| 218 |
-
|
| 219 |
-
# Debugging: Print the constructed prompt string
|
| 220 |
-
print(f"Constructed Prompt: {prompt_str}")
|
| 221 |
-
|
| 222 |
-
pixel_values = preprocess_image(input_image)
|
| 223 |
-
|
| 224 |
-
with torch.amp.autocast_mode.autocast('cuda', enabled=True):
|
| 225 |
-
vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
|
| 226 |
-
image_features = vision_outputs.hidden_states
|
| 227 |
-
embedded_images = image_adapter(image_features)
|
| 228 |
-
embedded_images = embedded_images.to('cuda')
|
| 229 |
-
|
| 230 |
-
# Load the model from MODEL_PATH
|
| 231 |
-
text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
|
| 232 |
-
text_model.eval()
|
| 233 |
-
|
| 234 |
-
# Debugging: Print the prompt string before passing to generate_caption
|
| 235 |
-
print(f"Prompt passed to generate_caption: {prompt_str}")
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
return caption
|
| 240 |
|
| 241 |
css = """
|
| 242 |
h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
|
|
@@ -256,110 +392,63 @@ ul, ol {
|
|
| 256 |
}
|
| 257 |
"""
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
"
|
| 262 |
-
"
|
| 263 |
-
"
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
"
|
| 268 |
-
"
|
| 269 |
-
"
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
"
|
| 274 |
-
"
|
| 275 |
-
"
|
| 276 |
-
"
|
| 277 |
-
"
|
| 278 |
-
"
|
| 279 |
-
"
|
| 280 |
-
"
|
| 281 |
-
"
|
| 282 |
-
"
|
| 283 |
-
"
|
| 284 |
-
"
|
| 285 |
-
"
|
| 286 |
-
"
|
| 287 |
-
"Kodachrome": "Color photos with moderate grain. Colors on either colder part of spectrum or regular, with normal or slightly higher saturation.",
|
| 288 |
-
"Rollei": "Mostly black and white photos, sometimes color with fine/low grain. Can be sepia colored or have unusual hues and desaturation. Great for landscapes."
|
| 289 |
-
}
|
| 290 |
-
|
| 291 |
-
composition_styles_info = {
|
| 292 |
-
"Rule of Thirds": "Divides the frame into a 3x3 grid, placing key elements along the lines or at their intersections.",
|
| 293 |
-
"Golden Ratio": "Uses a spiral based on the golden ratio to create a balanced and aesthetically pleasing composition.",
|
| 294 |
-
"Symmetry": "Creates a mirror-like balance in the image, often used for architectural or nature photography.",
|
| 295 |
-
"Leading Lines": "Uses lines within the frame to draw the viewer's eye to the main subject or through the image.",
|
| 296 |
-
"Framing": "Uses elements within the scene to create a frame around the main subject.",
|
| 297 |
-
"Minimalism": "Simplifies the composition to its essential elements, often with a lot of negative space.",
|
| 298 |
-
"Fill the Frame": "The main subject dominates the entire frame, leaving little to no background.",
|
| 299 |
-
"Negative Space": "Uses empty space around the subject to create a sense of simplicity or isolation.",
|
| 300 |
-
"Centered Composition": "Places the main subject in the center of the frame, creating a sense of stability or importance.",
|
| 301 |
-
"Diagonal Lines": "Uses diagonal elements to create a sense of movement or dynamic tension in the image.",
|
| 302 |
-
"Triangular Composition": "Arranges elements in the frame to form a triangle, creating a sense of stability and harmony.",
|
| 303 |
-
"Radial Balance": "Arranges elements in a circular pattern around a central point, creating a sense of movement or completeness."
|
| 304 |
}
|
| 305 |
|
| 306 |
-
|
| 307 |
-
"
|
| 308 |
-
"
|
| 309 |
-
"
|
| 310 |
-
"
|
| 311 |
-
"
|
| 312 |
-
"
|
| 313 |
-
"
|
| 314 |
-
"
|
| 315 |
-
"
|
| 316 |
-
"
|
| 317 |
-
"
|
| 318 |
-
"
|
| 319 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
}
|
| 321 |
|
| 322 |
-
special_techniques_info = {
|
| 323 |
-
"Double exposure": "Superimposes two exposures to create a single image, often resulting in a dreamy or surreal effect.",
|
| 324 |
-
"Long exposure": "Uses a long shutter speed to capture motion over time, often creating smooth, blurred effects for moving elements.",
|
| 325 |
-
"Multiple exposure": "Superimposes multiple exposures, multiplying the subject or its key elements across the image.",
|
| 326 |
-
"HDR": "High Dynamic Range imaging, combining multiple exposures to capture a wider range of light and dark tones.",
|
| 327 |
-
"Bokeh effect": "Creates a soft, out-of-focus background, often with circular highlights.",
|
| 328 |
-
"Silhouette": "Captures the outline of a subject against a brighter background, creating a dramatic contrast.",
|
| 329 |
-
"Panning": "Follows a moving subject with the camera, creating a sharp subject with a blurred background.",
|
| 330 |
-
"Light painting": "Uses long exposure and moving light sources to 'paint' with light in the image.",
|
| 331 |
-
"Infrared photography": "Captures light in the infrared spectrum, often resulting in surreal, otherworldly images.",
|
| 332 |
-
"Ultraviolet photography": "Captures light in the ultraviolet spectrum, often revealing hidden patterns or creating a strong violet glow.",
|
| 333 |
-
"Kirlian photography": "High-voltage photographic technique that captures corona discharges around objects, creating a glowing effect.",
|
| 334 |
-
"Thermography": "Captures infrared radiation to create images based on temperature differences, resulting in false-color heat maps.",
|
| 335 |
-
"Astrophotography": "Specialized technique for capturing astronomical objects and celestial events, often resulting in stunning starry backgrounds.",
|
| 336 |
-
"Underwater photography": "Captures images beneath the surface of water, often in pools, seas, or aquariums.",
|
| 337 |
-
"Aerial photography": "Captures images from an elevated position, such as from drones, helicopters, or planes.",
|
| 338 |
-
"Macro photography": "Extreme close-up photography, revealing tiny details not visible to the naked eye."
|
| 339 |
-
}
|
| 340 |
-
|
| 341 |
-
color_effects_info = {
|
| 342 |
-
"Black and white": "Removes all color, leaving only shades of gray.",
|
| 343 |
-
"Sepia": "Reddish-brown monochrome effect, often associated with vintage photography.",
|
| 344 |
-
"Monochrome": "Uses variations of a single color.",
|
| 345 |
-
"Vintage color": "Muted or faded color palette reminiscent of old photographs.",
|
| 346 |
-
"Cross-processed": "Deliberate processing of film in the wrong chemicals, creating unusual color shifts.",
|
| 347 |
-
"Desaturated": "Reduces the intensity of all colors in the image.",
|
| 348 |
-
"Vivid colors": "Increases the saturation and intensity of colors.",
|
| 349 |
-
"Pastel colors": "Soft, pale colors with a light and airy feel.",
|
| 350 |
-
"High contrast": "Emphasizes the difference between light and dark areas in the image.",
|
| 351 |
-
"Low contrast": "Reduces the difference between light and dark areas, creating a softer look.",
|
| 352 |
-
"Color splash": "Converts most of the image to black and white while leaving one or more elements in color."
|
| 353 |
-
}
|
| 354 |
-
|
| 355 |
-
def get_dropdown_choices(info_dict):
|
| 356 |
-
return [f"{key}: {value}" for key, value in info_dict.items()]
|
| 357 |
-
|
| 358 |
-
# Gradio interface
|
| 359 |
with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
| 360 |
with gr.Tab("Welcome"):
|
| 361 |
gr.Markdown(
|
| 362 |
-
|
| 363 |
<img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
|
| 364 |
|
| 365 |
# 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
|
|
@@ -367,7 +456,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
|
| 367 |
## Accelerate Your Creative Workflow with Intelligent Image Analysis
|
| 368 |
|
| 369 |
This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
|
| 370 |
-
training prompts,
|
| 371 |
|
| 372 |
## 🚀 How It Works:
|
| 373 |
1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
|
|
@@ -376,147 +465,109 @@ with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
|
| 376 |
4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
|
| 377 |
"""
|
| 378 |
)
|
| 379 |
-
|
| 380 |
with gr.Tab("JoyCaption"):
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
# How to Use JoyCaption
|
| 384 |
-
|
| 385 |
-
Hello, artist! Let's make some fun captions for your pictures. Here's how:
|
| 386 |
-
|
| 387 |
-
1. **Pick a Picture**: Find a cool picture you want to talk about and upload it.
|
| 388 |
-
|
| 389 |
-
2. **Choose What You Want**:
|
| 390 |
-
- **Caption Type**:
|
| 391 |
-
* "Descriptive" tells you what's in the picture
|
| 392 |
-
* "Training Prompt" helps computers make similar pictures
|
| 393 |
-
* "RNG-Tags" gives you short words about the picture
|
| 394 |
-
* "Style Prompt" creates detailed prompts for image generation
|
| 395 |
|
| 396 |
-
|
| 397 |
-
- "Formal" sounds like a teacher talking
|
| 398 |
-
- "Informal" sounds like a friend chatting
|
| 399 |
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
5. **Advanced Options** (for "Style Prompt" only):
|
| 406 |
-
- Choose lens type, film stock, composition, and lighting details
|
| 407 |
-
|
| 408 |
-
6. **Make the Caption**: Click the "Make My Caption!" button and watch the magic happen!
|
| 409 |
-
|
| 410 |
-
Remember, have fun and be creative with your captions!
|
| 411 |
-
|
| 412 |
-
## Tips for Great Captions:
|
| 413 |
-
- Try different types to see what you like best
|
| 414 |
-
- Experiment with formal and informal tones for fun variations
|
| 415 |
-
- Adjust the length to get just the right amount of detail
|
| 416 |
-
- For "Style Prompt", play with the advanced options for more specific results
|
| 417 |
-
- If you don't like a caption, just click "Make My Caption!" again for a new one
|
| 418 |
-
|
| 419 |
-
Have a great time captioning your art!
|
| 420 |
-
""")
|
| 421 |
|
| 422 |
with gr.Row():
|
| 423 |
-
with gr.Column():
|
| 424 |
-
input_image = gr.Image(type="pil", label="
|
| 425 |
|
| 426 |
caption_type = gr.Dropdown(
|
| 427 |
-
choices=[
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
value="descriptive",
|
| 430 |
)
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
caption_tone = gr.Dropdown(
|
| 433 |
choices=["formal", "informal"],
|
| 434 |
-
label="
|
| 435 |
value="formal",
|
| 436 |
)
|
| 437 |
|
|
|
|
|
|
|
| 438 |
caption_length = gr.Dropdown(
|
| 439 |
choices=["any", "very short", "short", "medium-length", "long", "very long"] +
|
| 440 |
[str(i) for i in range(20, 261, 10)],
|
| 441 |
-
label="
|
| 442 |
value="any",
|
| 443 |
)
|
| 444 |
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
)
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
choices=
|
| 454 |
-
label="
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
)
|
| 458 |
-
|
| 459 |
-
composition_style = gr.Dropdown(
|
| 460 |
-
choices=get_dropdown_choices(composition_styles_info),
|
| 461 |
-
label="Composition Style",
|
| 462 |
-
visible=False,
|
| 463 |
-
info="Select a composition style to guide the arrangement of elements in the image."
|
| 464 |
)
|
| 465 |
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
with gr.Column():
|
| 492 |
-
output_caption = gr.Textbox(label="Generated Caption")
|
| 493 |
-
|
| 494 |
-
# Container for advanced options
|
| 495 |
-
advanced_options = gr.Column(visible=False)
|
| 496 |
-
with advanced_options:
|
| 497 |
-
gr.Markdown("### Advanced Options for Style Prompt")
|
| 498 |
-
lens_type.render()
|
| 499 |
-
film_stock.render()
|
| 500 |
-
composition_style.render()
|
| 501 |
-
lighting_aspect.render()
|
| 502 |
-
special_technique.render()
|
| 503 |
-
color_effect.render()
|
| 504 |
-
|
| 505 |
-
def update_style_options(caption_type):
|
| 506 |
-
return {
|
| 507 |
-
lens_type: gr.update(visible=caption_type == "style_prompt"),
|
| 508 |
-
film_stock: gr.update(visible=caption_type == "style_prompt"),
|
| 509 |
-
composition_style: gr.update(visible=caption_type == "style_prompt"),
|
| 510 |
-
lighting_aspect: gr.update(visible=caption_type == "style_prompt"),
|
| 511 |
-
special_technique: gr.update(visible=caption_type == "style_prompt"),
|
| 512 |
-
color_effect: gr.update(visible=caption_type == "style_prompt"),
|
| 513 |
-
advanced_options: gr.update(visible=caption_type == "style_prompt"),
|
| 514 |
-
}
|
| 515 |
-
|
| 516 |
-
caption_type.change(update_style_options, inputs=[caption_type], outputs=[lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect, advanced_options])
|
| 517 |
-
|
| 518 |
-
run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length, lens_type, film_stock, composition_style, lighting_aspect, special_technique, color_effect], outputs=[output_caption])
|
| 519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
|
| 521 |
if __name__ == "__main__":
|
| 522 |
demo.launch()
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
CLIP_PATH = "google/siglip-so400m-patch14-384"
|
| 15 |
+
MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
|
| 16 |
CHECKPOINT_PATH = Path("9em124t2-499968")
|
|
|
|
| 17 |
CAPTION_TYPE_MAP = {
|
| 18 |
+
("descriptive", "formal", False, False): [
|
| 19 |
+
"Write a detailed, formal description of this image, focusing on composition, style, and artistic elements.",
|
| 20 |
+
"Provide a comprehensive, academic analysis of this artwork's visual characteristics and techniques."
|
| 21 |
+
],
|
| 22 |
+
("descriptive", "formal", False, True): [
|
| 23 |
+
"Craft a formal, concise description of this image within {word_count} words, highlighting key visual elements.",
|
| 24 |
+
"Summarize the artwork's main features and style in a formal tone, using no more than {word_count} words."
|
| 25 |
+
],
|
| 26 |
+
("descriptive", "formal", True, False): [
|
| 27 |
+
"Compose a {length} formal critique of this image, discussing its artistic merits and visual impact.",
|
| 28 |
+
"Create a {length} scholarly description of this artwork, analyzing its composition and aesthetic qualities."
|
| 29 |
+
],
|
| 30 |
+
("descriptive", "informal", False, False): [
|
| 31 |
+
"Describe this image as if you're explaining it to a friend, focusing on what stands out to you.",
|
| 32 |
+
"Give a casual, conversational rundown of what you see in this artwork and how it makes you feel."
|
| 33 |
+
],
|
| 34 |
+
("descriptive", "informal", False, True): [
|
| 35 |
+
"In about {word_count} words, give a laid-back description of this image's vibe and key features.",
|
| 36 |
+
"Summarize the coolest parts of this artwork in a casual tone, using roughly {word_count} words."
|
| 37 |
+
],
|
| 38 |
+
("descriptive", "informal", True, False): [
|
| 39 |
+
"Write a {length} chill description of this image, highlighting what you find most interesting or unique.",
|
| 40 |
+
"Give a {length} relaxed explanation of what's going on in this artwork and why it catches your eye."
|
| 41 |
+
],
|
| 42 |
+
("training_prompt", "formal", False, False): [
|
| 43 |
+
"Generate a detailed stable diffusion prompt to recreate this image, including style, composition, and key elements.",
|
| 44 |
+
"Craft a comprehensive prompt for an AI art generator to produce an image in the same style and mood as this artwork."
|
| 45 |
+
],
|
| 46 |
+
("training_prompt", "formal", False, True): [
|
| 47 |
+
"Within {word_count} words, create a precise stable diffusion prompt capturing the essence of this image.",
|
| 48 |
+
"Compose a concise AI art prompt of {word_count} words to replicate this artwork's style and content."
|
| 49 |
+
],
|
| 50 |
+
("training_prompt", "formal", True, False): [
|
| 51 |
+
"Write a {length} stable diffusion prompt that thoroughly describes this image's style, subject, and artistic techniques.",
|
| 52 |
+
"Develop a {length} detailed prompt for AI art generation, breaking down the key visual elements and artistic approach of this image."
|
| 53 |
+
],
|
| 54 |
+
("rng-tags", "formal", False, False): [
|
| 55 |
+
"Generate a comprehensive list of Booru tags describing this image's content, style, and artistic elements.",
|
| 56 |
+
"Create an extensive set of Booru tags covering all aspects of this artwork, including subject, technique, and mood."
|
| 57 |
+
],
|
| 58 |
+
("rng-tags", "formal", False, True): [
|
| 59 |
+
"Produce a focused list of Booru tags within {word_count} words, capturing the most important aspects of this image.",
|
| 60 |
+
"Compile a concise set of Booru tags, limited to {word_count} words, that best represent this artwork's key features."
|
| 61 |
+
],
|
| 62 |
+
("rng-tags", "formal", True, False): [
|
| 63 |
+
"Generate a {length} list of Booru tags, providing a thorough categorization of this image's content and style.",
|
| 64 |
+
"Create a {length} set of Booru tags that extensively describe all visual elements and artistic choices in this artwork."
|
| 65 |
+
],
|
| 66 |
+
("artistic_inspiration", "formal", False, False): [
|
| 67 |
+
"Analyze this image and suggest artistic variations or extensions that could be created based on its style and theme.",
|
| 68 |
+
"Provide a formal interpretation of this artwork's mood and style, offering ideas for complementary pieces or a series."
|
| 69 |
+
],
|
| 70 |
+
("artistic_inspiration", "informal", False, False): [
|
| 71 |
+
"Brainstorm some cool ideas for new artworks inspired by this image's style or subject matter.",
|
| 72 |
+
"Riff on this artwork's vibe and come up with some creative spin-offs or related pieces an artist could make."
|
| 73 |
+
],
|
| 74 |
+
("technical_breakdown", "formal", False, False): [
|
| 75 |
+
"Provide a detailed technical analysis of the artistic techniques and materials likely used to create this image.",
|
| 76 |
+
"Break down the compositional elements and artistic methods employed in this artwork, suitable for an art student's study."
|
| 77 |
+
],
|
| 78 |
+
("emotional_response", "informal", False, False): [
|
| 79 |
+
"Describe the emotions and feelings this artwork evokes, and explain why it might resonate with viewers.",
|
| 80 |
+
"Share your gut reaction to this image and speculate on what the artist might have been feeling or thinking."
|
| 81 |
+
],
|
| 82 |
+
|
| 83 |
+
("thematic_analysis", "formal", False, False): [
|
| 84 |
+
"Provide an in-depth analysis of the themes presented in this image, exploring the underlying messages and concepts.",
|
| 85 |
+
"Analyze the primary and secondary themes of this artwork, discussing their significance and interplay."
|
| 86 |
+
],
|
| 87 |
+
("thematic_analysis", "formal", False, True): [
|
| 88 |
+
"Within {word_count} words, dissect the main themes of this image, highlighting their relevance and impact.",
|
| 89 |
+
"Craft a concise thematic analysis of this artwork in {word_count} words, focusing on its core messages."
|
| 90 |
+
],
|
| 91 |
+
("thematic_analysis", "formal", True, False): [
|
| 92 |
+
"Write a {length} formal exploration of the themes depicted in this image, examining their depth and meaning.",
|
| 93 |
+
"Develop a {length} scholarly analysis of the thematic elements in this artwork, discussing their significance."
|
| 94 |
+
],
|
| 95 |
+
("stylistic_comparison", "informal", False, False): [
|
| 96 |
+
"Compare the style of this image to other famous art movements or artists, highlighting similarities and differences.",
|
| 97 |
+
"Describe how this artwork's style relates to [specific artist/style], and what makes it unique."
|
| 98 |
+
],
|
| 99 |
+
("stylistic_comparison", "informal", False, True): [
|
| 100 |
+
"In about {word_count} words, compare this image's style with other known art styles or artists.",
|
| 101 |
+
"Summarize the stylistic similarities and differences of this artwork compared to other genres in {word_count} words."
|
| 102 |
+
],
|
| 103 |
+
("stylistic_comparison", "informal", True, False): [
|
| 104 |
+
"Write a {length} casual comparison of this image's style with other art movements or famous artists.",
|
| 105 |
+
"Give a {length} relaxed description of how this artwork's style aligns or differs from other genres."
|
| 106 |
+
],
|
| 107 |
+
("narrative_suggestion", "formal", False, False): [
|
| 108 |
+
"Create a short narrative inspired by this image, outlining a possible story that reflects its visual elements.",
|
| 109 |
+
"Develop a brief storyline that complements the themes and mood depicted in this artwork."
|
| 110 |
+
],
|
| 111 |
+
("narrative_suggestion", "formal", False, True): [
|
| 112 |
+
"Within {word_count} words, outline a narrative inspired by this image's visual elements and mood.",
|
| 113 |
+
"Compose a concise story idea based on the themes and composition of this artwork in {word_count} words."
|
| 114 |
+
],
|
| 115 |
+
("narrative_suggestion", "formal", True, False): [
|
| 116 |
+
"Write a {length} formal narrative inspired by this image, detailing a story that aligns with its visual and thematic elements.",
|
| 117 |
+
"Develop a {length} scholarly storyline that reflects the mood and composition of this artwork."
|
| 118 |
+
],
|
| 119 |
+
("contextual_storytelling", "informal", False, False): [
|
| 120 |
+
"Tell a cool story that could be happening in the scene of this image, based on its visual cues.",
|
| 121 |
+
"Imagine a background story for this artwork, explaining what's happening and why."
|
| 122 |
+
],
|
| 123 |
+
("contextual_storytelling", "informal", False, True): [
|
| 124 |
+
"In about {word_count} words, create a backstory for the scene depicted in this image.",
|
| 125 |
+
"Summarize a possible background narrative for this artwork in {word_count} words."
|
| 126 |
+
],
|
| 127 |
+
("contextual_storytelling", "informal", True, False): [
|
| 128 |
+
"Write a {length} informal story that provides context to the scene portrayed in this image.",
|
| 129 |
+
"Give a {length} casual backstory explaining the events depicted in this artwork."
|
| 130 |
+
],
|
| 131 |
+
|
| 132 |
+
("style_prompt", "formal", False, False): [
|
| 133 |
+
"Analyze this image through the lens of {style} art. Describe how it aligns with or diverges from {style_characteristics}.",
|
| 134 |
+
"Examine this artwork in the context of the {style} movement, focusing on {style_focus} and how these elements are represented or reinterpreted in the image."
|
| 135 |
+
],
|
| 136 |
+
("style_prompt", "formal", False, True): [
|
| 137 |
+
"Within {word_count} words, compare this image to the {style} style, highlighting elements that reflect or contrast with its key characteristics.",
|
| 138 |
+
"Compose a concise {word_count}-word analysis of how this artwork relates to the {style} movement, noting its adherence to or departure from typical {style} elements."
|
| 139 |
+
],
|
| 140 |
+
("style_prompt", "formal", True, False): [
|
| 141 |
+
"Write a {length} critique of this image, exploring its relationship to the {style} movement. Discuss composition, technique, and thematic elements in this context.",
|
| 142 |
+
"Develop a {length} analysis of how this artwork incorporates or challenges the principles of {style}, considering its visual language and artistic approach."
|
| 143 |
+
],
|
| 144 |
+
("style_prompt", "informal", False, False): [
|
| 145 |
+
"Imagine this image is in an exhibition of {style} art. Describe what makes it fit in or stand out from other {style} pieces.",
|
| 146 |
+
"Give a casual rundown of how this artwork vibes with the {style} movement. What's similar? What's different? What's cool about it?"
|
| 147 |
+
],
|
| 148 |
+
("style_prompt", "informal", False, True): [
|
| 149 |
+
"In about {word_count} words, chat about how this image relates to {style} art. What catches your eye as typical or unusual for the style?",
|
| 150 |
+
"Summarize in roughly {word_count} words how this artwork plays with {style} ideas. What's familiar? What's a twist on the style?"
|
| 151 |
+
],
|
| 152 |
+
("style_prompt", "informal", True, False): [
|
| 153 |
+
"Write a {length} chill analysis of this image as if it's part of a {style} art show. What works? What's surprising? How does it make you feel?",
|
| 154 |
+
"Give a {length} relaxed breakdown of how this artwork fits (or doesn't) into the {style} scene. What's your take on its use of {style} elements?"
|
| 155 |
+
],
|
| 156 |
}
|
| 157 |
|
| 158 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
|
| 257 |
# Image Adapter
|
| 258 |
print("Loading image adapter")
|
| 259 |
image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
|
| 260 |
+
image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
|
| 261 |
image_adapter.eval()
|
| 262 |
image_adapter.to("cuda")
|
| 263 |
|
| 264 |
+
# After loading the tokenizer and model
|
| 265 |
+
print(f"Tokenizer class: {type(tokenizer)}")
|
| 266 |
+
print(f"BOS token: {tokenizer.bos_token}")
|
| 267 |
+
print(f"BOS token ID: {tokenizer.bos_token_id}")
|
| 268 |
+
print(f"EOS token: {tokenizer.eos_token}")
|
| 269 |
+
print(f"EOS token ID: {tokenizer.eos_token_id}")
|
| 270 |
+
print(f"Text model device: {text_model.device}")
|
| 271 |
+
|
| 272 |
+
# Ensure the tokenizer has the necessary special tokens
|
| 273 |
+
if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
|
| 274 |
+
print("Warning: BOS or EOS token is missing. Adding default tokens.")
|
| 275 |
+
special_tokens_dict = {}
|
| 276 |
+
if tokenizer.bos_token_id is None:
|
| 277 |
+
special_tokens_dict['bos_token'] = '<|endoftext|>'
|
| 278 |
+
if tokenizer.eos_token_id is None:
|
| 279 |
+
special_tokens_dict['eos_token'] = '<|endoftext|>'
|
| 280 |
+
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
|
| 281 |
+
print(f"Added {num_added_tokens} special tokens to the tokenizer.")
|
| 282 |
+
|
| 283 |
+
# Resize token embeddings of the model if new tokens are added
|
| 284 |
+
text_model.resize_token_embeddings(len(tokenizer))
|
| 285 |
+
|
| 286 |
+
@spaces.GPU()
@torch.no_grad()
def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
    """Generate text for ``input_image`` according to the selected caption settings.

    The prompt template is looked up in ``CAPTION_TYPE_MAP``, the image is encoded
    by ``clip_model`` and projected by ``image_adapter``, and the resulting
    embeddings are placed between a BOS embedding and the embedded prompt before
    ``text_model`` samples up to 300 new tokens.

    Args:
        input_image: PIL image to analyze. Converted to RGB and resized to 384x384.
        caption_type: First component of the ``CAPTION_TYPE_MAP`` key.
        caption_tone: Second component of the key; forced to ``"formal"`` for
            ``"rng-tags"`` and ``"training_prompt"``.
        caption_length: ``"any"`` for no constraint, a descriptive length string,
            or a word count given as an int or a numeric string.
        art_style: Style name substituted into the template and used to look up
            ``STYLE_CHARACTERISTICS`` / ``STYLE_FOCUS`` (generic fallbacks apply
            for unknown styles).

    Returns:
        The decoded text with leading and trailing whitespace removed.

    Raises:
        ValueError: If no image is supplied, or if the combination of caption
            type, tone and length kind has no entry in ``CAPTION_TYPE_MAP``.
    """
    # Fail early with a readable message instead of an AttributeError on `.resize`.
    if input_image is None:
        raise ValueError("No input image provided. Please upload an image first.")

    torch.cuda.empty_cache()

    # Handle caption_length: None means "no constraint", an int is a word count,
    # and any other string is kept as a descriptive length (e.g. "short").
    length = None
    if caption_length != "any":
        if isinstance(caption_length, int):
            length = caption_length
        elif isinstance(caption_length, str):
            try:
                length = int(caption_length)
            except ValueError:
                # If it's not a number, treat it as a descriptive length
                length = caption_length

    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
    if caption_type in ["rng-tags", "training_prompt"]:
        caption_tone = "formal"

    # Build prompt. The key records whether the length is descriptive (str) or numeric (int).
    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
    if prompt_key not in CAPTION_TYPE_MAP:
        raise ValueError(f"Invalid caption type: {prompt_key}")

    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
        length=length,
        word_count=length,
        style=art_style,
        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features"),
    )
    print(f"Prompt: {prompt_str}")

    # Preprocess image. Converting to RGB guarantees three channels for the vision
    # model even when the caller passes an RGBA, palette or grayscale image.
    image = input_image.convert("RGB").resize((384, 384), Image.LANCZOS)
    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
    pixel_values = pixel_values.to('cuda')

    # Tokenize the prompt (no special tokens: BOS and EOT are added manually below)
    prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)

    # Embed image
    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
        image_features = vision_outputs.hidden_states
        embedded_images = image_adapter(image_features)
        embedded_images = embedded_images.to('cuda')

    # Embed prompt
    prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
    expected_shape = (1, prompt.shape[1], text_model.config.hidden_size)
    assert prompt_embeds.shape == expected_shape, f"Prompt shape is {prompt_embeds.shape}, expected {expected_shape}"

    # Check for bos_token_id and provide a fallback
    bos_token_id = tokenizer.bos_token_id
    if bos_token_id is None:
        # NOTE(review): id 1 is only a guess and may not be the BOS token of the
        # loaded tokenizer — confirm against the model's vocabulary.
        print("Warning: bos_token_id is None. Using default value of 1.")
        bos_token_id = 1  # Common default, but may need adjustment

    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)

    # Construct prompts: [BOS] [image embeddings] [prompt] [EOT]
    batch_size = embedded_images.shape[0]
    inputs_embeds = torch.cat([
        embedded_bos.expand(batch_size, -1, -1),
        embedded_images.to(dtype=embedded_bos.dtype),
        prompt_embeds.expand(batch_size, -1, -1),
        eot_embed.expand(batch_size, -1, -1),
    ], dim=1)

    # input_ids mirrors inputs_embeds position for position; the image positions
    # are filled with zeros as placeholder ids.
    input_ids = torch.cat([
        torch.tensor([[bos_token_id]], dtype=torch.long),
        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
        prompt,
        torch.tensor([[eot_token_id]], dtype=torch.long),
    ], dim=1).to('cuda')
    attention_mask = torch.ones_like(input_ids)

    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)

    # Trim off the prompt
    generate_ids = generate_ids[:, input_ids.shape[1]:]

    # Drop a trailing end-of-sequence marker. The length guard avoids an
    # IndexError when generation produced no new tokens.
    if generate_ids.shape[1] > 0:
        last_token_id = generate_ids[0, -1].item()
        if last_token_id == tokenizer.eos_token_id or last_token_id == eot_token_id:
            generate_ids = generate_ids[:, :-1]

    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]

    return caption.strip()
|
|
|
|
|
|
|
| 376 |
|
| 377 |
css = """
|
| 378 |
h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
|
|
|
|
| 392 |
}
|
| 393 |
"""
|
| 394 |
|
| 395 |
+
# Art styles offered in the "Art Style" dropdown, in display order.
ART_STYLES = [
    "Impressionism",
    "Cubism",
    "Surrealism",
    "Abstract Expressionism",
    "Pop Art",
    "Minimalism",
    "Baroque",
    "Renaissance",
    "Art Nouveau",
    "Gothic",
    "Romanticism",
    "Realism",
    "Expressionism",
    "Fauvism",
    "Art Deco",
    "Futurism",
    "Dadaism",
    "Pointillism",
    "Rococo",
    "Neoclassicism",
]
|
| 401 |
+
|
| 402 |
+
# Short description of each art style's typical traits, keyed by style name.
# Looked up with the selected art style when a prompt template is formatted.
STYLE_CHARACTERISTICS = {
    "Impressionism": "loose brushstrokes, emphasis on light and color, everyday subjects",
    "Cubism": "geometric shapes, multiple perspectives, fragmented forms",
    "Surrealism": "dreamlike imagery, unexpected juxtapositions, subconscious exploration",
    "Abstract Expressionism": "expressive brushwork, emotional content, abstract forms",
    "Pop Art": "bright colors, popular culture references, satire",
    "Minimalism": "simple forms, limited color palette, emphasis on space",
    "Baroque": "dramatic lighting, elaborate detail, grandeur",
    "Renaissance": "realistic depictions, perspective, religious themes",
    "Art Nouveau": "stylized forms, organic shapes, decorative elements",
    "Gothic": "dark themes, dramatic lighting, architectural elements",
    "Romanticism": "emotional content, nature scenes, idealized figures",
    "Realism": "detailed depictions, realistic textures, everyday subjects",
    "Expressionism": "emotional content, distorted forms, abstract elements",
    "Fauvism": "bold colors, abstract forms, emotional content",
    "Art Deco": "geometric shapes, streamlined forms, modern aesthetics",
    "Futurism": "dynamic forms, speed, technology",
    "Dadaism": "anti-art, absurdity, subversion of traditional art",
    "Pointillism": "small dots of color, impressionistic style, emphasis on light",
    "Rococo": "ornate style, lighthearted themes, decorative elements",
    "Neoclassicism": "classical style, balance, symmetry",
}
|
| 424 |
|
| 425 |
+
# What each art style concentrates on, keyed by style name.
# Looked up with the selected art style when a prompt template is formatted.
STYLE_FOCUS = {
    "Impressionism": "capturing fleeting moments and atmospheric effects",
    "Cubism": "deconstructing and reassembling forms from multiple viewpoints",
    "Surrealism": "creating a sense of the uncanny and exploring the subconscious mind",
    "Abstract Expressionism": "expressing emotional content through abstract forms",
    "Pop Art": "commenting on popular culture and satirizing consumerism",
    "Minimalism": "exploring the relationship between form and space",
    "Baroque": "creating dramatic and grandiose compositions",
    "Renaissance": "depicting realistic scenes and exploring perspective",
    "Art Nouveau": "incorporating organic and decorative elements",
    "Gothic": "exploring dark themes and dramatic lighting",
    "Romanticism": "depicting emotional scenes and idealized figures",
    "Realism": "capturing detailed and realistic textures",
    "Expressionism": "expressing emotional content through distorted forms",
    "Fauvism": "emphasizing bold colors and emotional content",
    "Art Deco": "incorporating geometric shapes and modern aesthetics",
    "Futurism": "depicting speed, technology, and dynamism",
    "Dadaism": "subverting traditional art and exploring absurdity",
    "Pointillism": "capturing light and color through small dots",
    "Rococo": "creating lighthearted and decorative compositions",
    "Neoclassicism": "achieving balance and symmetry in classical style",
}
|
| 447 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
with gr.Blocks(theme="Hev832/Applio", css=css) as demo:
|
| 449 |
with gr.Tab("Welcome"):
|
| 450 |
gr.Markdown(
|
| 451 |
+
"""
|
| 452 |
<img src="https://path-to-yamamoto-logo.png" alt="Yamamoto Logo" class="centered-image">
|
| 453 |
|
| 454 |
# 🎨 Yamamoto JoyCaption: AI-Powered Art Inspiration
|
|
|
|
| 456 |
## Accelerate Your Creative Workflow with Intelligent Image Analysis
|
| 457 |
|
| 458 |
This innovative tool empowers Yamamoto's artists to quickly generate descriptive captions,<br>
|
| 459 |
+
training prompts, or tags from existing artwork, fueling the creative process for GenAI models.
|
| 460 |
|
| 461 |
## 🚀 How It Works:
|
| 462 |
1. **Upload Your Inspiration**: Drop in an image (e.g., a charcoal horse picture) that embodies your desired style.
|
|
|
|
| 465 |
4. **Generate and Iterate**: Click 'Caption' to analyze your image and use the results to inspire new creations.
|
| 466 |
"""
|
| 467 |
)
|
| 468 |
+
|
| 469 |
with gr.Tab("JoyCaption"):
|
| 470 |
+
gr.Markdown("""
|
| 471 |
+
# JoyCaption: AI-Powered Image Analysis Tool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
+
This tool helps you generate various types of text based on an uploaded image. Here's how to use it:
|
|
|
|
|
|
|
| 474 |
|
| 475 |
+
1. Upload an image
|
| 476 |
+
2. Choose your desired output type
|
| 477 |
+
3. Adjust settings as needed
|
| 478 |
+
4. Click 'Generate Caption' to get your result
|
| 479 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
|
| 481 |
with gr.Row():
|
| 482 |
+
with gr.Column(scale=1):
|
| 483 |
+
input_image = gr.Image(type="pil", label="Upload Your Image")
|
| 484 |
|
| 485 |
caption_type = gr.Dropdown(
|
| 486 |
+
choices=[
|
| 487 |
+
"descriptive",
|
| 488 |
+
"training_prompt",
|
| 489 |
+
"rng-tags",
|
| 490 |
+
"thematic_analysis",
|
| 491 |
+
"stylistic_comparison",
|
| 492 |
+
"narrative_suggestion",
|
| 493 |
+
"contextual_storytelling",
|
| 494 |
+
"style_prompt"
|
| 495 |
+
],
|
| 496 |
+
label="Output Type",
|
| 497 |
value="descriptive",
|
| 498 |
)
|
| 499 |
|
| 500 |
+
gr.Markdown("""
|
| 501 |
+
### Output Types Explained:
|
| 502 |
+
- **Descriptive**: A general description of the image
|
| 503 |
+
- **Training Prompt**: A prompt for AI image generation
|
| 504 |
+
- **RNG-Tags**: Tags for categorizing the image
|
| 505 |
+
- **Thematic Analysis**: Exploration of themes in the image
|
| 506 |
+
- **Stylistic Comparison**: Compares the image to art styles
|
| 507 |
+
- **Narrative Suggestion**: A story idea based on the image
|
| 508 |
+
- **Contextual Storytelling**: A background story for the image
|
| 509 |
+
- **Style Prompt**: Analyzes the image in context of a specific art style
|
| 510 |
+
""")
|
| 511 |
+
|
| 512 |
caption_tone = gr.Dropdown(
|
| 513 |
choices=["formal", "informal"],
|
| 514 |
+
label="Tone",
|
| 515 |
value="formal",
|
| 516 |
)
|
| 517 |
|
| 518 |
+
gr.Markdown("Choose between a formal (professional) or informal (casual) tone for the output.")
|
| 519 |
+
|
| 520 |
caption_length = gr.Dropdown(
|
| 521 |
choices=["any", "very short", "short", "medium-length", "long", "very long"] +
|
| 522 |
[str(i) for i in range(20, 261, 10)],
|
| 523 |
+
label="Length",
|
| 524 |
value="any",
|
| 525 |
)
|
| 526 |
|
| 527 |
+
gr.Markdown("""
|
| 528 |
+
Select the desired length of the output:
|
| 529 |
+
- 'any': No specific length
|
| 530 |
+
- Descriptive options: very short to very long
|
| 531 |
+
- Numeric options: Specify exact word count (20 to 260 words)
|
| 532 |
+
""")
|
| 533 |
+
|
| 534 |
+
art_style = gr.Dropdown(
|
| 535 |
+
choices=ART_STYLES,
|
| 536 |
+
label="Art Style (for Style Prompt)",
|
| 537 |
+
value="Impressionism",
|
| 538 |
+
visible=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
)
|
| 540 |
|
| 541 |
+
gr.Markdown("Select an art style to analyze the image in that context. Only applicable for 'Style Prompt' output type.")
|
| 542 |
+
|
| 543 |
+
with gr.Column(scale=1):
|
| 544 |
+
output_caption = gr.Textbox(label="Generated Output", lines=10)
|
| 545 |
+
generate_button = gr.Button("Generate Caption")
|
| 546 |
+
|
| 547 |
+
gr.Markdown("""
|
| 548 |
+
### Additional Notes:
|
| 549 |
+
- The 'Tone' setting doesn't affect 'RNG-Tags' and 'Training Prompt' outputs.
|
| 550 |
+
- 'Art Style' is only used when 'Style Prompt' is selected as the output type.
|
| 551 |
+
- The AI model analyzes the image and generates text based on your selections.
|
| 552 |
+
""")
|
| 553 |
+
|
| 554 |
+
def update_visibility(caption_type):
    """Show or hide the style and tone dropdowns for the selected output type.

    The art-style dropdown is visible only for "style_prompt"; the tone
    dropdown is hidden for "rng-tags" and "training_prompt".

    Returns a mapping from each component to its ``gr.update`` payload.
    """
    show_art_style = caption_type == "style_prompt"
    show_tone = caption_type not in ["rng-tags", "training_prompt"]
    return {
        art_style: gr.update(visible=show_art_style),
        caption_tone: gr.update(visible=show_tone),
    }
|
| 559 |
+
|
| 560 |
+
caption_type.change(
|
| 561 |
+
fn=update_visibility,
|
| 562 |
+
inputs=[caption_type],
|
| 563 |
+
outputs=[art_style, caption_tone]
|
| 564 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
|
| 566 |
+
generate_button.click(
|
| 567 |
+
fn=stream_chat,
|
| 568 |
+
inputs=[input_image, caption_type, caption_tone, caption_length, art_style],
|
| 569 |
+
outputs=[output_caption]
|
| 570 |
+
)
|
| 571 |
|
| 572 |
# Launch the Gradio app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch()
|