Spaces:
Running on Zero
Running on Zero
Upload app.py with huggingface_hub
Browse files
app.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
|
| 4 |
+
import torch
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from threading import Thread
|
| 7 |
+
from typing import Generator
|
| 8 |
+
from liger_kernel.transformers import apply_liger_kernel_to_llama
|
| 9 |
+
|
| 10 |
+
LOGO_SRC = """data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiIHN0YW5kYWxvbmU9Im5vIj8+CjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+Cjxzdmcgd2lkdGg9IjEwMCUiIGhlaWdodD0iMTAwJSIgdmlld0JveD0iMCAwIDUzOCA1MzUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSIgeG1sbnM6c2VyaWY9Imh0dHA6Ly93d3cuc2VyaWYuY29tLyIgc3R5bGU9ImZpbGwtcnVsZTpldmVub2RkO2NsaXAtcnVsZTpldmVub2RkO3N0cm9rZS1saW5lam9pbjpyb3VuZDtzdHJva2UtbWl0ZXJsaW1pdDoyOyI+CiAgICA8ZyB0cmFuc2Zvcm09Im1hdHJpeCgxLDAsMCwxLC0xNDcuODcxLDAuMDAxOTA4NjMpIj4KICAgICAgICA8cGF0aCBkPSJNMTk1LjY3LDIyMS42N0MxOTYuNzMsMjA1LjM3IDIwMC4yOCwxODkuNzYgMjA3LjkxLDE3NS4zN0MyMjcuOTgsMTM3LjUxIDI1OS4zMywxMTQuODggMzAyLjAxLDExMS42M0MzMzQuMTUsMTA5LjE4IDM2Ni41OSwxMTAuNiAzOTguODksMTEwLjNDNDAwLjUzLDExMC4yOCA0MDIuMTYsMTEwLjMgNDA0LjQsMTEwLjNDNDA0LjQsMTAxLjk5IDQwNC41Niw5NC4wNSA0MDQuMjMsODYuMTJDNDA0LjE4LDg0Ljg0IDQwMi4xNSw4My4xMyA0MDAuNjYsODIuNDlDMzgzLjIzLDc1LjAyIDM3My4wNSw1OS43OSAzNzMuOTYsNDAuOTZDMzc1LjA5LDE3LjU0IDM5MS40NywyLjY2IDQxMC42NSwwLjM3QzQzNy44OSwtMi44OSA0NTUuNTYsMTUuODQgNDU5LjI2LDM0LjY5QzQ2Mi45Niw1My41NyA0NTIuMTgsNzYuOTMgNDMyLjgxLDgyLjY2QzQzMS42NCw4My4wMSA0MzAuMzMsODUuMjMgNDMwLjI4LDg2LjYyQzQzMC4wMyw5NC4yNiA0MzAuMTYsMTAxLjkyIDQzMC4xNiwxMTAuM0w0MzUuNjMsMTEwLjNDNDYzLjc5LDExMC4zIDQ5MS45NiwxMTAuMjggNTIwLjEyLDExMC4zQzU3NC44NCwxMTAuMzYgNjIzLjA0LDE0OC4zNSA2MzUuNjcsMjAxLjU1QzYzNy4yMywyMDguMTMgNjM3LjgzLDIxNC45MyA2MzguODksMjIxLjY3QzY2MC40MywyMjQuOTQgNjc1LjE5LDIzNi42MiA2ODIuMzYsMjU3LjRDNjgzLjU5LDI2MC45NyA2ODQuNjUsMjY0LjgyIDY4NC42NywyNjguNTRDNjg0Ljc3LDI4My4zNCA2ODUuNzYsMjk4LjMxIDY4My45NCwzMTIuOTFDNjgwLjg5LDMzNy4yOSA2NjIuODYsMzUzLjM2IDYzOC40NywzNTUuODJDNjM1LjE0LDM4NS4wOCA2MjEuOTEsNDA5LjQxIDYwMC40NSw0MjkuMjFDNTgxLjYsNDQ2LjYxIDU1OS4xNCw0NTcuNSA1MzMuNTcsNDU5LjE4QzUwOC4xOCw0NjAuODQgNDgyLjY0LDQ2MC4yIDQ1Ny4xNiw0NjAuMzhDNDM1LjE2LDQ2MC41MyA0MTMuMTcsNDYwLjM0IDM5MS4xNyw0
NjAuNTNDMzg4Ljc2LDQ2MC41NSAzODUuOTUsNDYxLjU2IDM4NC4wMyw0NjMuMDRDMzcxLjU0LDQ3Mi42MiAzNTkuMTMsNDgyLjMxIDM0Ni45Miw0OTIuMjVDMzM4Ljk0LDQ5OC43NSAzMzEuMzksNTA1Ljc3IDMyMy41Niw1MTIuNDZDMzE3LjQ1LDUxNy42OCAzMTAuOTMsNTIyLjQ0IDMwNS4xMSw1MjcuOTVDMzAxLjE5LDUzMS42NiAyOTYuNTIsNTMzLjE3IDI5MS42OSw1MzQuMzZDMjg1LjY1LDUzNS44NSAyNzkuMjIsNTI5LjEzIDI3OS4wMSw1MjEuMTlDMjc4LjgsNTEyLjg2IDI3OC45NSw1MDQuNTMgMjc4Ljk0LDQ5Ni4xOUwyNzguOTQsNDU2LjY5QzIzMi44Miw0MzguMTYgMjAzLjU2LDQwNi4yMyAxOTUuMDcsMzU2LjA4QzE5My4yNiwzNTUuNzUgMTkwLjg0LDM1NS40MSAxODguNDgsMzU0Ljg2QzE2Ny40NiwzNDkuOTEgMTU1LjA0LDMzNi4wMiAxNTAuNzIsMzE1LjYyQzE0Ni45OCwyOTcuOTkgMTQ2LjksMjc5LjY3IDE1MC42MSwyNjIuMDlDMTU1LjU1LDIzOC42OCAxNzEuNDIsMjI1LjU5IDE5NS42NiwyMjEuNjdMMTk1LjY3LDIyMS42N1pNMzA4LjA3LDQ4Ny44MkMzMTUuOTQsNDgxLjEzIDMyMi44NSw0NzUuMTMgMzI5LjksNDY5LjNDMzQ0LjM5LDQ1Ny4zMSAzNTguOSw0NDUuMzYgMzczLjU0LDQzMy41NkMzNzUuMTcsNDMyLjI1IDM3Ny42OCw0MzEuNCAzNzkuNzksNDMxLjM5QzQxNC43OCw0MzEuMjYgNDQ5Ljc4LDQzMS4zOCA0ODQuNzcsNDMxLjI0QzUwMC4zOSw0MzEuMTggNTE2LjEzLDQzMS43NiA1MzEuNjIsNDMwLjE2QzU3Ni45Miw0MjUuNDkgNjA5LjI0LDM4Ny43NyA2MDguOTUsMzQ0Ljg0QzYwOC42OCwzMDUuNTIgNjA4LjkzLDI2Ni4xOSA2MDguODcsMjI2Ljg2QzYwOC44NywyMjMuMjIgNjA4LjU4LDIxOS41NSA2MDcuOTksMjE1Ljk2QzYwMy4xMSwxODYuMjkgNTg4LjYxLDE2My4zMyA1NjEuMzIsMTQ5LjMyQzU0OS4wNCwxNDMuMDIgNTM2LjE1LDEzOS4yOSA1MjIuMjIsMTM5LjI5QzQ1My45LDEzOS4zMiAzODUuNTgsMTM5LjIgMzE3LjI2LDEzOS4zNUMzMDkuMiwxMzkuMzcgMzAwLjk2LDEzOS44OSAyOTMuMTEsMTQxLjZDMjU0LjE5LDE1MC4wNyAyMjUuMzMsMTg1LjY5IDIyNS4wMywyMjUuNDJDMjI0LjgsMjU2LjA4IDIyNC44NiwyODYuNzQgMjI0Ljk5LDMxNy40QzIyNS4wNSwzMzAuNTMgMjI0Ljc0LDM0My43NiAyMjYuMTgsMzU2Ljc3QzIyOC43NCwzODAuMDUgMjQwLjYsMzk4LjYyIDI1OC43OSw0MTIuOTNDMjczLjA0LDQyNC4xNCAyODkuNjMsNDMwLjAyIDMwNy42MSw0MzEuNTVDMzA3LjgyLDQzMi4wMyAzMDguMDYsNDMyLjMzIDMwOC4wNiw0MzIuNjNDMzA4LjA4LDQ1MC42IDMwOC4wOCw0NjguNTcgMzA4LjA4LDQ4Ny44MUwzMDguMDcsNDg3LjgyWk00MzUuNzksNDMuMzNDNDM1Ljk1LDMzLjQyIDQyNy42MSwyNC42NSA0MTcuOCwyNC40QzQwNi43NiwyNC4xMiAzOTguMjUsMzIuMDUgMzk4LjEzLDQyLjc0QzM5OC4wMSw1My4wNCA0MDYuNiw2Mi4xMiA0MTYuNDIsNjIuMDhDNDI3LjExLDYyLjA0IDQzNS42MSw1
My44MSA0MzUuNzgsNDMuMzNMNDM1Ljc5LDQzLjMzWiIgc3R5bGU9ImZpbGw6cmdiKDczLDQ3LDExOCk7ZmlsbC1ydWxlOm5vbnplcm87Ii8+CiAgICAgICAgPHBhdGggZD0iTTQxOS4zLDM5MS42M0MzNzQuNDYsMzkwLjQgMzQxLjUxLDM3Mi42MyAzMTguMDEsMzM3LjcxQzMxNS42NywzMzQuMjMgMzEzLjc3LDMzMC4wNCAzMTMuMSwzMjUuOTVDMzExLjg0LDMxOC4yOCAzMTYuNTMsMzExLjcgMzIzLjcyLDMwOS40NkMzMzAuNjYsMzA3LjI5IDMzOC4zMiwzMTAuMSAzNDEuOTgsMzE3LjAzQzM0OS4xNSwzMzAuNjMgMzU5LjE2LDM0MS4zNSAzNzIuMywzNDkuMzFDNDAxLjMyLDM2Ni44OSA0NDQuNTYsMzYzLjcgNDcwLjYxLDM0Mi4zNUM0NzkuMSwzMzUuMzkgNDg2LjA4LDMyNy40MSA0OTEuNTUsMzE3Ljk3QzQ5NS4wNSwzMTEuOTMgNTAwLjIsMzA4LjE4IDUwNy40NywzMDguOTVDNTEzLjczLDMwOS42MSA1MTguODYsMzEyLjg4IDUyMC4xMiwzMTkuMjFDNTIwLjksMzIzLjEzIDUyMC43MywzMjguMjIgNTE4LjgzLDMzMS41NUM1MDAuNjMsMzYzLjMyIDQ3My41NSwzODIuOTUgNDM3LjI5LDM4OS4zN0M0MzAuNDQsMzkwLjU4IDQyMy40OCwzOTEuMTIgNDE5LjI5LDM5MS42M0w0MTkuMywzOTEuNjNaIiBzdHlsZT0iZmlsbDpyZ2IoMjUwLDEzOSwxKTtmaWxsLXJ1bGU6bm9uemVybzsiLz4KICAgICAgICA8cGF0aCBkPSJNNDYyLjcxLDI0MC4xOUM0NjIuOCwyMTYuOTEgNDgwLjI0LDE5OS43OSA1MDQuMDEsMTk5LjY3QzUyNi41NywxOTkuNTUgNTQ0Ljg5LDIxOC4wNyA1NDQuNTEsMjQxLjM0QzU0NC4xOCwyNjEuODUgNTMwLjA5LDI4MS45NiA1MDEuOTEsMjgxLjIzQzQ4MC42OCwyODAuNjggNDYyLjE1LDI2My44IDQ2Mi43MSwyNDAuMkw0NjIuNzEsMjQwLjE5WiIgc3R5bGU9ImZpbGw6cmdiKDI1MCwxMzksMSk7ZmlsbC1ydWxlOm5vbnplcm87Ii8+CiAgICAgICAgPHBhdGggZD0iTTM3MC45OSwyNDAuMDhDMzcxLDI2Mi43OSAzNTIuNTMsMjgxLjM1IDMyOS44OSwyODEuMzdDMzA3LjA1LDI4MS40IDI4OC45NiwyNjMuNDIgMjg4Ljk2LDI0MC42OEMyODguOTYsMjE4LjE0IDMwNi43MywyMDAgMzI5LjE2LDE5OS42MkMzNTIuMDIsMTk5LjI0IDM3MC45OCwyMTcuNTcgMzcwLjk5LDI0MC4wOFoiIHN0eWxlPSJmaWxsOnJnYigyNTAsMTM5LDEpO2ZpbGwtcnVsZTpub256ZXJvOyIvPgogICAgPC9nPgo8L3N2Zz4K"""
|
| 11 |
+
|
| 12 |
+
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
|
| 13 |
+
|
| 14 |
+
# Enhanced title with ENHANCED badge.
# Page header markup passed to gr.HTML: inline CSS for the header layout,
# followed by the logo, the product name and the "ENHANCED" badge.
# This is an f-string: literal CSS braces are doubled ({{ }}) and {LOGO_SRC}
# is the only interpolated value.
TITLE = f"""<style>
.joy-header {{display:flex; align-items:center; justify-content:center;
gap:16px; margin:4px 0 12px;}}
.joy-header h1{{margin:0; font-size:1.9rem; line-height:1.2;}}
.joy-header p {{margin:2px 0 0; font-size:0.9rem; color:#666;}}
.joy-header img{{height:56px;}}
.enhanced-badge {{background: linear-gradient(45deg, #ff6b35, #f7931e); color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.7rem; font-weight: bold;}}
</style>

<div class="joy-header">
<img src="{LOGO_SRC}" alt="JoyCaption logo">
<div>
<h1>JoyCaption <span style="font-weight:400">Beta One</span> <span class="enhanced-badge">ENHANCED</span></h1>
<p>Advanced Image-captioning with keyword injection | build enhanced</p>
</div>
</div>
<hr>"""
|
| 32 |
+
|
| 33 |
+
# Enhanced description.
# Help text rendered below the interface via gr.Markdown: a feature list and
# the quick-start steps. The emoji are written as real Unicode characters;
# they had previously been mis-decoded into Greek letters.
DESCRIPTION = """
<div>
<h2>🚀 Enhanced Features</h2>
<ul>
<li><strong>🎯 Keyword Injection:</strong> Force specific words/phrases to appear in captions</li>
<li><strong>🔍 Focus Areas:</strong> Emphasize particular aspects of the image</li>
<li><strong>🎨 Tone Control:</strong> Choose from professional, casual, technical styles</li>
<li><strong>⚙️ Advanced Prompting:</strong> Enhanced templates for better accuracy</li>
<li><strong>📊 Quality Metrics:</strong> Real-time feedback on keyword inclusion</li>
</ul>

<h2>Quick-start</h2>
<ol>
<li><strong>Upload</strong> an image in the left panel</li>
<li><strong>Add Required Keywords</strong> that must appear in the caption (optional)</li>
<li><strong>Select Focus Areas</strong> to emphasize specific image aspects</li>
<li><strong>Choose Caption Type</strong> and adjust settings</li>
<li><strong>Generate</strong> your enhanced caption!</li>
</ol>
</div>
"""
|
| 55 |
+
|
| 56 |
+
# Original caption types (preserved).
# Every caption type maps to exactly three prompt templates, selected by index
# in build_enhanced_prompt:
#   [0] caption length is "any"          -> no length placeholder
#   [1] caption length is a digit string -> uses {word_count}
#   [2] any other caption length         -> uses {length}
ORIGINAL_CAPTION_TYPES = {
    "Descriptive": [
        "Write a detailed description for this image.",
        "Write a detailed description for this image in {word_count} words or less.",
        "Write a {length} detailed description for this image.",
    ],
    "Art Critic": [
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
        "Analyze this image like an art critic would in {word_count} words.",
        "Analyze this image like an art critic would. Keep it {length}.",
    ],
}

# Enhanced caption types with keyword integration.
# These templates additionally carry {keyword_instruction} and
# {focus_instruction} (and, in one case, {tone}) placeholders that
# build_enhanced_prompt fills in.
ENHANCED_CAPTION_TYPES = {
    "Keyword-Focused Descriptive": [
        "Write a detailed, accurate description of this image. {keyword_instruction} Integrate any specified terms naturally while maintaining descriptive accuracy. Focus on {focus_instruction}",
        "Write a detailed description of this image in {word_count} words or less. {keyword_instruction} {focus_instruction}",
        "Write a {length} detailed description of this image. {keyword_instruction} {focus_instruction}",
    ],
    "Enhanced Descriptive": [
        "Provide a comprehensive, detailed description of this image. {keyword_instruction} Pay special attention to {focus_instruction} Use {tone} language throughout.",
        "Describe this image comprehensively in {word_count} words. {keyword_instruction} Focus on {focus_instruction}",
        "Write a {length} comprehensive description. {keyword_instruction} Emphasize {focus_instruction}",
    ],
}

# Combine caption types.
# Enhanced entries come first, so they lead the "Caption Type" dropdown, whose
# choices are built from this dict's keys.
ALL_CAPTION_TYPES = {**ENHANCED_CAPTION_TYPES, **ORIGINAL_CAPTION_TYPES}
|
| 86 |
+
|
| 87 |
+
# Load model.
# The processor and model are created once at import time; chat_joycaption
# reads these module-level objects on every call.
print("Loading JoyCaption model...")
processor = AutoProcessor.from_pretrained(MODEL_PATH)
# Weights are requested as bfloat16, matching the bfloat16 cast applied to
# pixel_values in chat_joycaption.
# NOTE(review): device_map=0 presumably places the whole model on GPU index 0 — confirm.
model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
model.eval()
# Apply the Liger kernel patch to the language-model part of the LLaVA model.
apply_liger_kernel_to_llama(model=model.language_model)
print("Model loaded successfully!")
|
| 94 |
+
|
| 95 |
+
def build_enhanced_prompt(
    caption_type: str,
    caption_length: str,
    required_keywords: str,
    focus_areas: list,
    tone_style: str
) -> str:
    """Build the captioning prompt from the current UI selections.

    Args:
        caption_type: Key into ``ALL_CAPTION_TYPES``.
        caption_length: ``"any"``, a digit string (word count) or a named
            length such as ``"short"``. ``None``/empty is treated as ``"any"``.
        required_keywords: Comma-separated words/phrases the caption must
            contain. ``None`` or blank means no keyword instruction.
        focus_areas: Aspects of the image to emphasize; may be empty or ``None``.
        tone_style: Tone name; ``"Auto-detect"`` (or an empty value) maps to
            ``"appropriate"``.

    Returns:
        The formatted prompt with runs of whitespace collapsed to one space.

    Raises:
        KeyError: If ``caption_type`` is not a known caption type.
    """
    # Choose template index: 0 = no length constraint, 1 = numeric word
    # count, 2 = named length.
    length_text = "" if caption_length is None else str(caption_length).strip()
    if not length_text or length_text == "any":
        template_idx = 0
    elif length_text.isdigit():
        template_idx = 1
    else:
        template_idx = 2

    # Get base prompt template
    base_prompt = ALL_CAPTION_TYPES[caption_type][template_idx]

    # Build keyword instruction
    keywords = [k.strip() for k in (required_keywords or "").split(",") if k.strip()]
    keyword_instruction = ""
    if keywords:
        keyword_instruction = f"REQUIRED KEYWORDS - You MUST naturally include these words/phrases: {', '.join(keywords)}."

    # Build focus instruction. Some templates already introduce the
    # placeholder ("Focus on {focus_instruction}"), others leave it bare; only
    # the bare ones get a full sentence, otherwise the lead-in would be doubled
    # ("Focus on Pay special attention to: ...").
    focus_phrase = ", ".join(focus_areas) if focus_areas else "all visible elements"
    if _focus_placeholder_has_lead_in(base_prompt):
        focus_instruction = f"{focus_phrase}."
    elif focus_areas:
        focus_instruction = f"Pay special attention to: {focus_phrase}."
    else:
        focus_instruction = f"Pay special attention to {focus_phrase}."

    # Add tone
    if not tone_style or tone_style == "Auto-detect":
        tone = "appropriate"
    else:
        tone = tone_style.lower()

    # str.format ignores keyword arguments a template does not reference, so
    # the same call serves both the enhanced and the original templates.
    formatted_prompt = base_prompt.format(
        keyword_instruction=keyword_instruction,
        focus_instruction=focus_instruction,
        tone=tone,
        length=length_text,
        word_count=length_text,
    )

    # An empty keyword instruction leaves a double space behind; normalize it.
    return " ".join(formatted_prompt.split())


def _focus_placeholder_has_lead_in(template: str) -> bool:
    """Return True when the text before ``{focus_instruction}`` introduces it."""
    head, marker, _ = template.partition("{focus_instruction}")
    if not marker:
        return False
    head = head.rstrip()
    # A placeholder that opens the template, or follows a finished sentence or
    # another placeholder, has no lead-in of its own.
    return bool(head) and not head.endswith((".", "!", "?", ":", "}"))
|
| 149 |
+
|
| 150 |
+
def check_keyword_inclusion(caption: str, required_keywords: str) -> str:
    """Report which required keywords appear in the caption.

    Matching is a case-insensitive substring test, so multi-word phrases work
    and a keyword also matches inside a longer word.

    Args:
        caption: Generated caption text; ``None`` is treated as empty.
        required_keywords: Comma-separated keywords/phrases; ``None`` or a
            string with no actual keywords (e.g. ``" , ,"``) means none.

    Returns:
        A one- or two-line summary: the included keywords with a count and,
        if any are absent, a second line listing the missing ones.
    """
    # Normalize and de-duplicate while keeping the order the user typed.
    keywords = list(dict.fromkeys(
        k.strip().lower() for k in (required_keywords or "").split(",") if k.strip()
    ))
    if not keywords:
        return "✅ No required keywords specified"

    caption_lower = (caption or "").lower()
    included = [k for k in keywords if k in caption_lower]
    missing = [k for k in keywords if k not in caption_lower]

    result = f"✅ Included ({len(included)}/{len(keywords)}): {', '.join(included) if included else 'None'}"
    if missing:
        result += f"\n❌ Missing: {', '.join(missing)}"

    return result
|
| 172 |
+
|
| 173 |
+
@spaces.GPU()
@torch.no_grad()
def chat_joycaption(input_image: Image.Image, prompt: str, temperature: float, top_p: float, max_new_tokens: int) -> Generator[str, None, None]:
    """Stream a caption for ``input_image``, yielding the text produced so far.

    Args:
        input_image: Image to caption. When ``None`` a single explanatory
            message is yielded and nothing is generated.
        prompt: User instruction; surrounding whitespace is stripped.
        temperature: Sampling temperature. Values ``<= 0`` disable sampling,
            and ``temperature``/``top_p`` are then passed as ``None``.
        top_p: Nucleus-sampling threshold, used only when sampling.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The accumulated caption after each chunk received from the streamer.
    """
    torch.cuda.empty_cache()

    if input_image is None:
        yield "No image provided. Please upload an image."
        return

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant specialized in providing detailed, accurate image descriptions. Focus on being precise, comprehensive, and following instructions exactly.",
    }
    user_message = {
        "role": "user",
        "content": prompt.strip(),
    }
    chat_text = processor.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = processor(text=[chat_text], images=[input_image], return_tensors="pt").to('cuda')
    # The model was loaded as bfloat16, so the image tensor is cast to match.
    model_inputs['pixel_values'] = model_inputs['pixel_values'].to(torch.bfloat16)

    token_stream = TextIteratorStreamer(
        processor.tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    sampling = temperature > 0
    generation_args = {
        **model_inputs,
        "max_new_tokens": max_new_tokens,
        "do_sample": sampling,
        "suppress_tokens": None,
        "use_cache": True,
        "temperature": temperature if sampling else None,
        "top_k": None,
        "top_p": top_p if sampling else None,
        "streamer": token_stream,
    }

    # Generation runs in a background thread so this generator can relay the
    # streamer's output as it arrives.
    worker = Thread(target=model.generate, kwargs=generation_args)
    worker.start()

    caption = ""
    for piece in token_stream:
        caption += piece
        yield caption
|
| 220 |
+
|
| 221 |
+
def generate_with_quality_check(input_image, prompt, temperature, top_p, max_new_tokens, required_keywords):
    """Stream the caption, then append a keyword-inclusion report.

    Yields ``(caption_so_far, "")`` for every partial caption coming from
    ``chat_joycaption`` and finally ``(final_caption, feedback)``, where
    ``feedback`` is the result of ``check_keyword_inclusion``.
    """
    latest = ""
    for latest in chat_joycaption(input_image, prompt, temperature, top_p, max_new_tokens):
        # Feedback stays empty while text is still streaming.
        yield latest, ""

    # Check keyword inclusion against the last (complete) caption.
    yield latest, check_keyword_inclusion(latest, required_keywords)
|
| 234 |
+
|
| 235 |
+
# Create Gradio interface.
# Left column: image upload, keyword/focus/tone controls, caption type and
# length, and generation settings. Right column: the editable prompt, the run
# button, the caption output and the keyword report.
with gr.Blocks(title="JoyCaption Enhanced") as demo:
    gr.HTML(TITLE)

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image", height=512, width=512)

            # Enhanced keyword injection section
            with gr.Group():
                gr.HTML("<h3>🎯 Keyword Control</h3>")
                required_keywords = gr.Textbox(
                    label="Required Keywords/Phrases",
                    placeholder="Enter comma-separated words that MUST appear in the caption (e.g., 'sunset, peaceful, golden hour')",
                    info="These keywords will be naturally integrated into the description"
                )

                focus_areas = gr.CheckboxGroup(
                    choices=[
                        "Foreground objects", "Background elements", "Colors and lighting",
                        "Textures and materials", "Spatial relationships", "Actions and poses",
                        "Facial expressions", "Clothing and accessories", "Artistic style",
                        "Composition and framing", "Mood and atmosphere"
                    ],
                    label="Focus Areas (emphasize these aspects)",
                    value=["Foreground objects", "Colors and lighting"]
                )

                tone_style = gr.Dropdown(
                    choices=["Auto-detect", "Professional", "Casual", "Technical", "Creative", "Academic", "Marketing"],
                    value="Professional",
                    label="Description Tone"
                )

            caption_type = gr.Dropdown(
                choices=list(ALL_CAPTION_TYPES.keys()),
                value="Enhanced Descriptive",
                label="Caption Type",
                info="Enhanced types support keyword injection and focus areas"
            )

            caption_length = gr.Dropdown(
                choices=["any", "very short", "short", "medium-length", "long", "very long"] +
                        [str(i) for i in range(20, 261, 10)],
                label="Caption Length",
                value="long",
            )

            with gr.Accordion("Generation Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.0, maximum=2.0, value=0.6, step=0.05,
                    label="Temperature",
                    info="Higher values make output more creative"
                )
                top_p_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.9, step=0.01,
                    label="Top-p"
                )
                max_tokens_slider = gr.Slider(
                    minimum=1, maximum=2048, value=512, step=1,
                    label="Max New Tokens"
                )

        with gr.Column():
            prompt_box = gr.Textbox(lines=6, label="Generated Prompt", interactive=True)

            run_button = gr.Button("🚀 Generate Enhanced Caption", variant="primary", size="lg")

            output_caption = gr.Textbox(label="Generated Caption", lines=8)

            quality_feedback = gr.Textbox(
                label="📊 Keyword Quality Check",
                lines=3
            )

    # Event handlers: any change to a prompt-shaping control rebuilds the prompt.
    prompt_inputs = [caption_type, caption_length, required_keywords, focus_areas, tone_style]
    for ctrl in prompt_inputs:
        ctrl.change(
            build_enhanced_prompt,
            inputs=prompt_inputs,
            outputs=prompt_box,
        )

    run_button.click(
        generate_with_quality_check,
        inputs=[input_image, prompt_box, temperature_slider, top_p_slider, max_tokens_slider, required_keywords],
        outputs=[output_caption, quality_feedback],
    )

    # Initialize prompt from the widgets' own default values, so the initial
    # prompt cannot drift from the defaults declared on the components.
    demo.load(
        build_enhanced_prompt,
        inputs=prompt_inputs,
        outputs=prompt_box,
    )

    gr.Markdown(DESCRIPTION)

if __name__ == "__main__":
    demo.launch()
|