nickdigger committed on
Commit
793c145
·
verified ·
1 Parent(s): 4b2dace

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +333 -0
app.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
4
+ import torch
5
+ from PIL import Image
6
+ from threading import Thread
7
+ from typing import Generator
8
+ from liger_kernel.transformers import apply_liger_kernel_to_llama
9
+
10
+ LOGO_SRC = """data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiIHN0YW5kYWxvbmU9Im5vIj8+CjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+Cjxzdmcgd2lkdGg9IjEwMCUiIGhlaWdodD0iMTAwJSIgdmlld0JveD0iMCAwIDUzOCA1MzUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSIgeG1sbnM6c2VyaWY9Imh0dHA6Ly93d3cuc2VyaWYuY29tLyIgc3R5bGU9ImZpbGwtcnVsZTpldmVub2RkO2NsaXAtcnVsZTpldmVub2RkO3N0cm9rZS1saW5lam9pbjpyb3VuZDtzdHJva2UtbWl0ZXJsaW1pdDoyOyI+CiAgICA8ZyB0cmFuc2Zvcm09Im1hdHJpeCgxLDAsMCwxLC0xNDcuODcxLDAuMDAxOTA4NjMpIj4KICAgICAgICA8cGF0aCBkPSJNMTk1LjY3LDIyMS42N0MxOTYuNzMsMjA1LjM3IDIwMC4yOCwxODkuNzYgMjA3LjkxLDE3NS4zN0MyMjcuOTgsMTM3LjUxIDI1OS4zMywxMTQuODggMzAyLjAxLDExMS42M0MzMzQuMTUsMTA5LjE4IDM2Ni41OSwxMTAuNiAzOTguODksMTEwLjNDNDAwLjUzLDExMC4yOCA0MDIuMTYsMTEwLjMgNDA0LjQsMTEwLjNDNDA0LjQsMTAxLjk5IDQwNC41Niw5NC4wNSA0MDQuMjMsODYuMTJDNDA0LjE4LDg0Ljg0IDQwMi4xNSw4My4xMyA0MDAuNjYsODIuNDlDMzgzLjIzLDc1LjAyIDM3My4wNSw1OS43OSAzNzMuOTYsNDAuOTZDMzc1LjA5LDE3LjU0IDM5MS40NywyLjY2IDQxMC42NSwwLjM3QzQzNy44OSwtMi44OSA0NTUuNTYsMTUuODQgNDU5LjI2LDM0LjY5QzQ2Mi45Niw1My41NyA0NTIuMTgsNzYuOTMgNDMyLjgxLDgyLjY2QzQzMS42NCw4My4wMSA0MzAuMzMsODUuMjMgNDMwLjI4LDg2LjYyQzQzMC4wMyw5NC4yNiA0MzAuMTYsMTAxLjkyIDQzMC4xNiwxMTAuM0w0MzUuNjMsMTEwLjNDNDYzLjc5LDExMC4zIDQ5MS45NiwxMTAuMjggNTIwLjEyLDExMC4zQzU3NC44NCwxMTAuMzYgNjIzLjA0LDE0OC4zNSA2MzUuNjcsMjAxLjU1QzYzNy4yMywyMDguMTMgNjM3LjgzLDIxNC45MyA2MzguODksMjIxLjY3QzY2MC40MywyMjQuOTQgNjc1LjE5LDIzNi42MiA2ODIuMzYsMjU3LjRDNjgzLjU5LDI2MC45NyA2ODQuNjUsMjY0LjgyIDY4NC42NywyNjguNTRDNjg0Ljc3LDI4My4zNCA2ODUuNzYsMjk4LjMxIDY4My45NCwzMTIuOTFDNjgwLjg5LDMzNy4yOSA2NjIuODYsMzUzLjM2IDYzOC40NywzNTUuODJDNjM1LjE0LDM4NS4wOCA2MjEuOTEsNDA5LjQxIDYwMC40NSw0MjkuMjFDNTgxLjYsNDQ2LjYxIDU1OS4xNCw0NTcuNSA1MzMuNTcsNDU5LjE4QzUwOC4xOCw0NjAuODQgNDgyLjY0LDQ2MC4yIDQ1Ny4xNiw0NjAuMzhDNDM1LjE2LDQ2MC41MyA0MTMuMTcsNDYwLjM0IDM5MS4xNy
w0NjAuNTNDMzg4Ljc2LDQ2MC41NSAzODUuOTUsNDYxLjU2IDM4NC4wMyw0NjMuMDRDMzcxLjU0LDQ3Mi42MiAzNTkuMTMsNDgyLjMxIDM0Ni45Miw0OTIuMjVDMzM4Ljk0LDQ5OC43NSAzMzEuMzksNTA1Ljc3IDMyMy41Niw1MTIuNDZDMzE3LjQ1LDUxNy42OCAzMTAuOTMsNTIyLjQ0IDMwNS4xMSw1MjcuOTVDMzAxLjE5LDUzMS42NiAyOTYuNTIsNTMzLjE3IDI5MS42OSw1MzQuMzZDMjg1LjY1LDUzNS44NSAyNzkuMjIsNTI5LjEzIDI3OS4wMSw1MjEuMTlDMjc4LjgsNTEyLjg2IDI3OC45NSw1MDQuNTMgMjc4Ljk0LDQ5Ni4xOUwyNzguOTQsNDU2LjY5QzIzMi44Miw0MzguMTYgMjAzLjU2LDQwNi4yMyAxOTUuMDcsMzU2LjA4QzE5My4yNiwzNTUuNzUgMTkwLjg0LDM1NS40MSAxODguNDgsMzU0Ljg2QzE2Ny40NiwzNDkuOTEgMTU1LjA0LDMzNi4wMiAxNTAuNzIsMzE1LjYyQzE0Ni45OCwyOTcuOTkgMTQ2LjksMjc5LjY3IDE1MC42MSwyNjIuMDlDMTU1LjU1LDIzOC42OCAxNzEuNDIsMjI1LjU5IDE5NS42NiwyMjEuNjdMMTk1LjY3LDIyMS42N1pNMzA4LjA3LDQ4Ny44MkMzMTUuOTQsNDgxLjEzIDMyMi44NSw0NzUuMTMgMzI5LjksNDY5LjNDMzQ0LjM5LDQ1Ny4zMSAzNTguOSw0NDUuMzYgMzczLjU0LDQzMy41NkMzNzUuMTcsNDMyLjI1IDM3Ny42OCw0MzEuNCAzNzkuNzksNDMxLjM5QzQxNC43OCw0MzEuMjYgNDQ5Ljc4LDQzMS4zOCA0ODQuNzcsNDMxLjI0QzUwMC4zOSw0MzEuMTggNTE2LjEzLDQzMS43NiA1MzEuNjIsNDMwLjE2QzU3Ni45Miw0MjUuNDkgNjA5LjI0LDM4Ny43NyA2MDguOTUsMzQ0Ljg0QzYwOC42OCwzMDUuNTIgNjA4LjkzLDI2Ni4xOSA2MDguODcsMjI2Ljg2QzYwOC44NywyMjMuMjIgNjA4LjU4LDIxOS41NSA2MDcuOTksMjE1Ljk2QzYwMy4xMSwxODYuMjkgNTg4LjYxLDE2My4zMyA1NjEuMzIsMTQ5LjMyQzU0OS4wNCwxNDMuMDIgNTM2LjE1LDEzOS4yOSA1MjIuMjIsMTM5LjI5QzQ1My45LDEzOS4zMiAzODUuNTgsMTM5LjIgMzE3LjI2LDEzOS4zNUMzMDkuMiwxMzkuMzcgMzAwLjk2LDEzOS44OSAyOTMuMTEsMTQxLjZDMjU0LjE5LDE1MC4wNyAyMjUuMzMsMTg1LjY5IDIyNS4wMywyMjUuNDJDMjI0LjgsMjU2LjA4IDIyNC44NiwyODYuNzQgMjI0Ljk5LDMxNy40QzIyNS4wNSwzMzAuNTMgMjI0Ljc0LDM0My43NiAyMjYuMTgsMzU2Ljc3QzIyOC43NCwzODAuMDUgMjQwLjYsMzk4LjYyIDI1OC43OSw0MTIuOTNDMjczLjA0LDQyNC4xNCAyODkuNjMsNDMwLjAyIDMwNy42MSw0MzEuNTVDMzA3LjgyLDQzMi4wMyAzMDguMDYsNDMyLjMzIDMwOC4wNiw0MzIuNjNDMzA4LjA4LDQ1MC42IDMwOC4wOCw0NjguNTcgMzA4LjA4LDQ4Ny44MUwzMDguMDcsNDg3LjgyWk00MzUuNzksNDMuMzNDNDM1Ljk1LDMzLjQyIDQyNy42MSwyNC42NSA0MTcuOCwyNC40QzQwNi43NiwyNC4xMiAzOTguMjUsMzIuMDUgMzk4LjEzLDQyLjc0QzM5OC4wMSw1My4wNCA0MDYuNiw2Mi4xMiA0MTYuNDIsNjIuMDhDNDI3LjExLDYyLjA0IDQzNS42MS
w1My44MSA0MzUuNzgsNDMuMzNMNDM1Ljc5LDQzLjMzWiIgc3R5bGU9ImZpbGw6cmdiKDczLDQ3LDExOCk7ZmlsbC1ydWxlOm5vbnplcm87Ii8+CiAgICAgICAgPHBhdGggZD0iTTQxOS4zLDM5MS42M0MzNzQuNDYsMzkwLjQgMzQxLjUxLDM3Mi42MyAzMTguMDEsMzM3LjcxQzMxNS42NywzMzQuMjMgMzEzLjc3LDMzMC4wNCAzMTMuMSwzMjUuOTVDMzExLjg0LDMxOC4yOCAzMTYuNTMsMzExLjcgMzIzLjcyLDMwOS40NkMzMzAuNjYsMzA3LjI5IDMzOC4zMiwzMTAuMSAzNDEuOTgsMzE3LjAzQzM0OS4xNSwzMzAuNjMgMzU5LjE2LDM0MS4zNSAzNzIuMywzNDkuMzFDNDAxLjMyLDM2Ni44OSA0NDQuNTYsMzYzLjcgNDcwLjYxLDM0Mi4zNUM0NzkuMSwzMzUuMzkgNDg2LjA4LDMyNy40MSA0OTEuNTUsMzE3Ljk3QzQ5NS4wNSwzMTEuOTMgNTAwLjIsMzA4LjE4IDUwNy40NywzMDguOTVDNTEzLjczLDMwOS42MSA1MTguODYsMzEyLjg4IDUyMC4xMiwzMTkuMjFDNTIwLjksMzIzLjEzIDUyMC43MywzMjguMjIgNTE4LjgzLDMzMS41NUM1MDAuNjMsMzYzLjMyIDQ3My41NSwzODIuOTUgNDM3LjI5LDM4OS4zN0M0MzAuNDQsMzkwLjU4IDQyMy40OCwzOTEuMTIgNDE5LjI5LDM5MS42M0w0MTkuMywzOTEuNjNaIiBzdHlsZT0iZmlsbDpyZ2IoMjUwLDEzOSwxKTtmaWxsLXJ1bGU6bm9uemVybzsiLz4KICAgICAgICA8cGF0aCBkPSJNNDYyLjcxLDI0MC4xOUM0NjIuOCwyMTYuOTEgNDgwLjI0LDE5OS43OSA1MDQuMDEsMTk5LjY3QzUyNi41NywxOTkuNTUgNTQ0Ljg5LDIxOC4wNyA1NDQuNTEsMjQxLjM0QzU0NC4xOCwyNjEuODUgNTMwLjA5LDI4MS45NiA1MDEuOTEsMjgxLjIzQzQ4MC42OCwyODAuNjggNDYyLjE1LDI2My44IDQ2Mi43MSwyNDAuMkw0NjIuNzEsMjQwLjE5WiIgc3R5bGU9ImZpbGw6cmdiKDI1MCwxMzksMSk7ZmlsbC1ydWxlOm5vbnplcm87Ii8+CiAgICAgICAgPHBhdGggZD0iTTM3MC45OSwyNDAuMDhDMzcxLDI2Mi43OSAzNTIuNTMsMjgxLjM1IDMyOS44OSwyODEuMzdDMzA3LjA1LDI4MS40IDI4OC45NiwyNjMuNDIgMjg4Ljk2LDI0MC42OEMyODguOTYsMjE4LjE0IDMwNi43MywyMDAgMzI5LjE2LDE5OS42MkMzNTIuMDIsMTk5LjI0IDM3MC45OCwyMTcuNTcgMzcwLjk5LDI0MC4wOFoiIHN0eWxlPSJmaWxsOnJnYigyNTAsMTM5LDEpO2ZpbGwtcnVsZTpub256ZXJvOyIvPgogICAgPC9nPgo8L3N2Zz4K"""
11
+
12
# Hugging Face Hub repository the processor and model weights are loaded from.
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"

# Page header rendered through gr.HTML: inline CSS plus the logo/title banner.
# This is an f-string, so literal CSS braces are doubled ({{ }}) and only
# {LOGO_SRC} is interpolated.
TITLE = f"""<style>
.joy-header {{display:flex; align-items:center; justify-content:center;
gap:16px; margin:4px 0 12px;}}
.joy-header h1{{margin:0; font-size:1.9rem; line-height:1.2;}}
.joy-header p {{margin:2px 0 0; font-size:0.9rem; color:#666;}}
.joy-header img{{height:56px;}}
.enhanced-badge {{background: linear-gradient(45deg, #ff6b35, #f7931e); color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.7rem; font-weight: bold;}}
</style>

<div class="joy-header">
<img src="{LOGO_SRC}" alt="JoyCaption logo">
<div>
<h1>JoyCaption <span style="font-weight:400">Beta&nbsp;One</span> <span class="enhanced-badge">ENHANCED</span></h1>
<p>Advanced Image-captioning with keyword injection &nbsp;|&nbsp; build enhanced</p>
</div>
</div>
<hr>"""
32
+
33
# Feature overview and quick-start guide shown below the interface.
# The lines are deliberately flush-left: the text is rendered through
# gr.Markdown, where indented lines following a blank line would be treated
# as a code block.
DESCRIPTION = """
<div>
<h2>🚀 Enhanced Features</h2>
<ul>
<li><strong>🎯 Keyword Injection:</strong> Force specific words/phrases to appear in captions</li>
<li><strong>📋 Focus Areas:</strong> Emphasize particular aspects of the image</li>
<li><strong>🎨 Tone Control:</strong> Choose from professional, casual, technical styles</li>
<li><strong>⚙️ Advanced Prompting:</strong> Enhanced templates for better accuracy</li>
<li><strong>📊 Quality Metrics:</strong> Real-time feedback on keyword inclusion</li>
</ul>

<h2>Quick-start</h2>
<ol>
<li><strong>Upload</strong> an image in the left panel</li>
<li><strong>Add Required Keywords</strong> that must appear in the caption (optional)</li>
<li><strong>Select Focus Areas</strong> to emphasize specific image aspects</li>
<li><strong>Choose Caption Type</strong> and adjust settings</li>
<li><strong>Generate</strong> your enhanced caption!</li>
</ol>
</div>
"""
55
+
56
# Prompt templates, keyed by caption type.
#
# Every entry is a list of exactly three templates, selected by position in
# build_enhanced_prompt:
#   [0] no length constraint ("any")
#   [1] numeric word count   -> uses {word_count}
#   [2] named length         -> uses {length}

# Original caption types (preserved); these only use {word_count} / {length}.
ORIGINAL_CAPTION_TYPES = {
    "Descriptive": [
        "Write a detailed description for this image.",
        "Write a detailed description for this image in {word_count} words or less.",
        "Write a {length} detailed description for this image.",
    ],
    "Art Critic": [
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
        "Analyze this image like an art critic would in {word_count} words.",
        "Analyze this image like an art critic would. Keep it {length}.",
    ],
}

# Enhanced caption types with keyword integration; these additionally use
# {keyword_instruction}, {focus_instruction} and (in one template) {tone}.
ENHANCED_CAPTION_TYPES = {
    "Keyword-Focused Descriptive": [
        "Write a detailed, accurate description of this image. {keyword_instruction} Integrate any specified terms naturally while maintaining descriptive accuracy. Focus on {focus_instruction}",
        "Write a detailed description of this image in {word_count} words or less. {keyword_instruction} {focus_instruction}",
        "Write a {length} detailed description of this image. {keyword_instruction} {focus_instruction}",
    ],
    "Enhanced Descriptive": [
        "Provide a comprehensive, detailed description of this image. {keyword_instruction} Pay special attention to {focus_instruction} Use {tone} language throughout.",
        "Describe this image comprehensively in {word_count} words. {keyword_instruction} Focus on {focus_instruction}",
        "Write a {length} comprehensive description. {keyword_instruction} Emphasize {focus_instruction}",
    ],
}

# Combine caption types; enhanced types come first so they lead the dropdown.
ALL_CAPTION_TYPES = {**ENHANCED_CAPTION_TYPES, **ORIGINAL_CAPTION_TYPES}
86
+
87
# Load the processor and model once, at import time, so every request reuses
# the same instances. The weights are requested in bfloat16 with device_map=0.
print("Loading JoyCaption model...")
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
model.eval()  # inference only
apply_liger_kernel_to_llama(model=model.language_model)
print("Model loaded successfully!")
94
+
95
def _focus_fragment(template: str, focus_areas: list) -> str:
    """Return the text to substitute for ``{focus_instruction}`` in *template*."""
    areas = ", ".join(focus_areas) if focus_areas else "all visible elements"
    lead = template.partition("{focus_instruction}")[0].rstrip()
    # Some templates embed the placeholder mid-sentence ("Focus on {...}",
    # "Emphasize {...}"); others leave it standing alone after a sentence or
    # another placeholder. Only the stand-alone form needs its own lead-in,
    # otherwise the prompt reads "Focus on Pay special attention to: ...".
    if lead and lead[-1] not in ".!?}":
        return f"{areas}."
    return f"Pay special attention to: {areas}."


def build_enhanced_prompt(
    caption_type: str,
    caption_length: str,
    required_keywords: str,
    focus_areas: list,
    tone_style: str,
) -> str:
    """Build the captioning prompt with keyword injection and focus areas.

    Args:
        caption_type: Key into ``ALL_CAPTION_TYPES``.
        caption_length: ``"any"`` (or empty/``None``) for no constraint, a
            number (``"50"`` or ``50``) for a word count, or a named length
            such as ``"long"``.
        required_keywords: Comma-separated words/phrases the caption must
            contain; empty or ``None`` for no requirement.
        focus_areas: Image aspects to emphasize; empty or ``None`` means
            "all visible elements".
        tone_style: Tone name; ``"Auto-detect"`` (or empty/``None``) maps to
            the word "appropriate".

    Returns:
        The formatted prompt with runs of whitespace collapsed to one space.

    Raises:
        KeyError: If ``caption_type`` is not in ``ALL_CAPTION_TYPES``.
    """
    # Pick the template slot: 0 = unconstrained, 1 = word count, 2 = named length.
    length = "any" if caption_length is None else str(caption_length).strip()
    if not length or length == "any":
        template_idx = 0
    elif length.isdigit():
        template_idx = 1
    else:
        template_idx = 2

    template = ALL_CAPTION_TYPES[caption_type][template_idx]

    # Keyword instruction (empty when no usable keyword was supplied).
    keywords = [k.strip() for k in (required_keywords or "").split(",") if k.strip()]
    keyword_instruction = ""
    if keywords:
        keyword_instruction = (
            "REQUIRED KEYWORDS - You MUST naturally include these words/phrases: "
            f"{', '.join(keywords)}."
        )

    tone = "appropriate" if not tone_style or tone_style == "Auto-detect" else tone_style.lower()

    # str.format ignores keyword arguments a template does not reference, so
    # the original templates (which only use {length}/{word_count}) need no
    # separate fallback path.
    prompt = template.format(
        keyword_instruction=keyword_instruction,
        focus_instruction=_focus_fragment(template, focus_areas),
        tone=tone,
        length=length,
        word_count=length,
    )

    # An empty keyword instruction would otherwise leave a double space behind.
    return " ".join(prompt.split())
149
+
150
def check_keyword_inclusion(caption: str, required_keywords: str) -> str:
    """Report which required keywords appear in the caption.

    Matching is a case-insensitive substring test, so a keyword also counts
    as included when it occurs inside a longer word.

    Args:
        caption: The generated caption text (``None`` is treated as empty).
        required_keywords: Comma-separated keywords/phrases (``None`` is
            treated as empty).

    Returns:
        A one- or two-line summary: the included keywords with a count and,
        if any are absent, a second line listing the missing ones.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so a
    # keyword entered twice is not counted twice.
    keywords = list(dict.fromkeys(
        k.strip().lower() for k in (required_keywords or "").split(",") if k.strip()
    ))
    # Also covers input that contains only commas/whitespace.
    if not keywords:
        return "✅ No required keywords specified"

    caption_lower = (caption or "").lower()
    included = [k for k in keywords if k in caption_lower]
    missing = [k for k in keywords if k not in caption_lower]

    result = f"✅ Included ({len(included)}/{len(keywords)}): {', '.join(included) if included else 'None'}"
    if missing:
        result += f"\n❌ Missing: {', '.join(missing)}"

    return result
172
+
173
@spaces.GPU()
@torch.no_grad()
def chat_joycaption(input_image: Image.Image, prompt: str, temperature: float, top_p: float, max_new_tokens: int) -> Generator[str, None, None]:
    """Stream a caption for ``input_image`` generated from ``prompt``.

    Args:
        input_image: Image to caption; ``None`` yields a notice and stops.
        prompt: User prompt text (surrounding whitespace is stripped).
        temperature: Sampling temperature; ``0`` disables sampling.
        top_p: Nucleus-sampling threshold, only passed on when sampling.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The caption accumulated so far, once per chunk the streamer produces.
    """
    # Bail out before touching the GPU when there is nothing to caption.
    if input_image is None:
        yield "No image provided. Please upload an image."
        return

    torch.cuda.empty_cache()

    system_content = "You are a helpful assistant specialized in providing detailed, accurate image descriptions. Focus on being precise, comprehensive, and following instructions exactly."

    convo = [
        {
            "role": "system",
            "content": system_content,
        },
        {
            "role": "user",
            "content": (prompt or "").strip(),
        },
    ]

    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[convo_string], images=[input_image], return_tensors="pt").to('cuda')
    # The model is loaded in bfloat16, so the pixel tensor is cast to match.
    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)

    streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Temperature 0 means greedy decoding; the sampling knobs are then passed
    # as None rather than as zero values.
    sampling = temperature > 0
    generate_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=sampling,
        suppress_tokens=None,
        use_cache=True,
        temperature=temperature if sampling else None,
        top_k=None,
        top_p=top_p if sampling else None,
        streamer=streamer,
    )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    caption = ""
    for text in streamer:
        caption += text
        yield caption

    # The streamer is exhausted, so generation has finished; reap the thread
    # instead of leaving it dangling.
    worker.join()
220
+
221
def generate_with_quality_check(input_image, prompt, temperature, top_p, max_new_tokens, required_keywords):
    """Stream a caption, then append keyword-inclusion feedback.

    Yields ``(caption_so_far, "")`` for every partial caption produced by
    ``chat_joycaption`` and finally ``(final_caption, feedback)``, where
    ``feedback`` comes from ``check_keyword_inclusion``. When no image was
    supplied, only the notice from ``chat_joycaption`` is yielded.
    """
    final_caption = ""
    for partial_caption in chat_joycaption(input_image, prompt, temperature, top_p, max_new_tokens):
        final_caption = partial_caption
        # Feedback stays blank while the caption is still streaming.
        yield partial_caption, ""

    # Without an image the only output was the "no image" notice; scoring it
    # against the keywords would report misleading "Missing" feedback.
    if input_image is None:
        return

    # Check keyword inclusion on the complete caption.
    yield final_caption, check_keyword_inclusion(final_caption, required_keywords)
234
+
235
# Create Gradio interface.
# Left column: image upload and prompt controls. Right column: the editable
# prompt, the run button, the caption and the keyword feedback.
with gr.Blocks(title="JoyCaption Enhanced") as demo:
    gr.HTML(TITLE)

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image", height=512, width=512)

            # Enhanced keyword injection section
            with gr.Group():
                gr.HTML("<h3>🎯 Keyword Control</h3>")
                required_keywords = gr.Textbox(
                    label="Required Keywords/Phrases",
                    placeholder="Enter comma-separated words that MUST appear in the caption (e.g., 'sunset, peaceful, golden hour')",
                    info="These keywords will be naturally integrated into the description",
                )

                focus_areas = gr.CheckboxGroup(
                    choices=[
                        "Foreground objects", "Background elements", "Colors and lighting",
                        "Textures and materials", "Spatial relationships", "Actions and poses",
                        "Facial expressions", "Clothing and accessories", "Artistic style",
                        "Composition and framing", "Mood and atmosphere",
                    ],
                    label="Focus Areas (emphasize these aspects)",
                    value=["Foreground objects", "Colors and lighting"],
                )

                tone_style = gr.Dropdown(
                    choices=["Auto-detect", "Professional", "Casual", "Technical", "Creative", "Academic", "Marketing"],
                    value="Professional",
                    label="Description Tone",
                )

            caption_type = gr.Dropdown(
                choices=list(ALL_CAPTION_TYPES.keys()),
                value="Enhanced Descriptive",
                label="Caption Type",
                info="Enhanced types support keyword injection and focus areas",
            )

            caption_length = gr.Dropdown(
                choices=["any", "very short", "short", "medium-length", "long", "very long"]
                + [str(i) for i in range(20, 261, 10)],
                label="Caption Length",
                value="long",
            )

            with gr.Accordion("Generation Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.0, maximum=2.0, value=0.6, step=0.05,
                    label="Temperature",
                    info="Higher values make output more creative",
                )
                top_p_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.9, step=0.01,
                    label="Top-p",
                )
                max_tokens_slider = gr.Slider(
                    minimum=1, maximum=2048, value=512, step=1,
                    label="Max New Tokens",
                )

        with gr.Column():
            prompt_box = gr.Textbox(lines=6, label="Generated Prompt", interactive=True)

            run_button = gr.Button("🚀 Generate Enhanced Caption", variant="primary", size="lg")

            output_caption = gr.Textbox(label="Generated Caption", lines=8)

            quality_feedback = gr.Textbox(
                label="📊 Keyword Quality Check",
                lines=3,
            )

    # Event handlers
    # The controls that feed the prompt, in build_enhanced_prompt's argument order.
    prompt_inputs = [caption_type, caption_length, required_keywords, focus_areas, tone_style]

    # Rebuild the prompt whenever any of those controls changes.
    for ctrl in prompt_inputs:
        ctrl.change(
            build_enhanced_prompt,
            inputs=prompt_inputs,
            outputs=prompt_box,
        )

    run_button.click(
        generate_with_quality_check,
        inputs=[input_image, prompt_box, temperature_slider, top_p_slider, max_tokens_slider, required_keywords],
        outputs=[output_caption, quality_feedback],
    )

    # Initialize prompt from the widgets' current values, so the initial text
    # cannot drift from the defaults declared on the components above.
    demo.load(
        build_enhanced_prompt,
        inputs=prompt_inputs,
        outputs=prompt_box,
    )

    gr.Markdown(DESCRIPTION)

if __name__ == "__main__":
    demo.launch()