Spaces:
Running on Zero
Running on Zero
Commit Β·
dc9212d
1
Parent(s): 1103783
v6.1: performance & stability improvements
Browse files- Remove use_cache=False β KV-cache re-enabled (~20-25% faster generation)
- Remove torch.manual_seed injection β no longer conflicts with KV-cache reuse
- Consolidate 3x redundant CUDA cache clears β single post-generation cleanup
- GPU duration: 60β30 for captions, 40β20 for Q&A (improves queue priority)
- Shorten system/user prompts ~40% (removes redundant qualifiers)
- Add stable elem_id on all interactive components
- Add image_input.change() handler to clear outputs on re-upload (fixes Error state persistence)
app.py
CHANGED
|
@@ -1,237 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
try:
|
| 2 |
import spaces
|
| 3 |
if not hasattr(spaces, 'GPU'):
|
| 4 |
-
def
|
| 5 |
-
def
|
| 6 |
-
return
|
| 7 |
-
spaces.GPU =
|
| 8 |
except Exception:
|
| 9 |
import types
|
| 10 |
spaces = types.SimpleNamespace()
|
| 11 |
-
def
|
| 12 |
-
def
|
| 13 |
-
return
|
| 14 |
-
spaces.GPU =
|
| 15 |
|
| 16 |
import gradio as gr
|
| 17 |
import torch
|
| 18 |
from transformers import LlavaForConditionalGeneration, AutoProcessor
|
| 19 |
-
import tempfile, gc, os,
|
| 20 |
from urllib.parse import urlparse
|
| 21 |
from typing import Optional
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
if not
|
| 27 |
-
return
|
| 28 |
-
|
| 29 |
try:
|
| 30 |
-
|
| 31 |
except Exception:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
if parsed and parsed.scheme and parsed.netloc:
|
| 36 |
-
full = raw_url_or_path
|
| 37 |
-
# Fix gradio API paths
|
| 38 |
if "/file=" in full and "/gradio_api/file=" not in full:
|
| 39 |
full = full.replace("/file=", "/gradio_api/file=")
|
| 40 |
-
if "file=" in full and "/gradio_api/file=" not in full and "/gradio_api" not in full:
|
| 41 |
-
full = full.replace("file=", "gradio_api/file=")
|
| 42 |
return full
|
| 43 |
-
|
| 44 |
-
# Handle local temp files - convert to HF Space URLs
|
| 45 |
-
if raw_url_or_path.startswith("/tmp/") or raw_url_or_path.startswith("tmp/") or "temp" in raw_url_or_path.lower():
|
| 46 |
-
# Try to get the host from environment or use a default
|
| 47 |
if not host:
|
| 48 |
host = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST")
|
| 49 |
-
|
| 50 |
if host:
|
| 51 |
host = host.rstrip("/")
|
| 52 |
-
if not
|
| 53 |
host = "https://" + host
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
if not
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
host = "https://" + host
|
| 65 |
-
# Clean the path
|
| 66 |
-
clean_path = raw_url_or_path.lstrip("/")
|
| 67 |
-
return f"{host}/gradio_api/file=/{clean_path}"
|
| 68 |
-
|
| 69 |
-
return raw_url_or_path
|
| 70 |
-
|
| 71 |
-
def postprocess_caption(caption: str, max_chars: int = 1200) -> str:
|
| 72 |
-
if not caption or not isinstance(caption, str):
|
| 73 |
-
return caption or ""
|
| 74 |
-
result = re.sub(r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*', '', caption.strip(), flags=re.IGNORECASE)
|
| 75 |
-
if max_chars and len(result) > max_chars:
|
| 76 |
-
truncate_point = max_chars
|
| 77 |
for i in range(len(result) - 1, max(0, max_chars - 100), -1):
|
| 78 |
if result[i] in '.!?':
|
| 79 |
-
|
| 80 |
break
|
| 81 |
-
result = result[:
|
| 82 |
-
if result and not
|
| 83 |
-
result +=
|
| 84 |
return result
|
| 85 |
|
| 86 |
-
def
|
| 87 |
try:
|
| 88 |
if torch.cuda.is_available():
|
| 89 |
torch.cuda.empty_cache()
|
| 90 |
-
torch.cuda.synchronize()
|
| 91 |
gc.collect()
|
| 92 |
except Exception:
|
| 93 |
pass
|
| 94 |
|
| 95 |
-
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
"TORCH_HOME": "torch_cache"
|
| 104 |
-
}.items():
|
| 105 |
-
os.environ[key] = os.path.join(_tmpdir, folder)
|
| 106 |
|
| 107 |
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
|
| 108 |
SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
|
| 109 |
|
| 110 |
-
print("π Loading JoyCaption
|
| 111 |
processor = AutoProcessor.from_pretrained(MODEL_PATH, cache_dir=None)
|
| 112 |
model = LlavaForConditionalGeneration.from_pretrained(
|
| 113 |
MODEL_PATH,
|
| 114 |
torch_dtype=torch.bfloat16,
|
| 115 |
device_map="auto",
|
| 116 |
cache_dir=None,
|
| 117 |
-
low_cpu_mem_usage=True
|
| 118 |
)
|
| 119 |
model.eval()
|
| 120 |
-
print("β
Model loaded
|
|
|
|
|
|
|
| 121 |
|
| 122 |
-
# ===== DEFAULT PROMPTS =====
|
| 123 |
DEFAULT_PROMPTS = {
|
| 124 |
"casual": {
|
| 125 |
"system": "You are a descriptive image captioner focused on settings and female subjects.",
|
| 126 |
-
"user":
|
|
|
|
|
|
|
|
|
|
| 127 |
},
|
| 128 |
"friendly": {
|
| 129 |
-
"system": "You are an
|
| 130 |
-
"user":
|
|
|
|
|
|
|
|
|
|
| 131 |
},
|
| 132 |
"erotic": {
|
| 133 |
-
"system": "You are an adult content specialist
|
| 134 |
-
"user":
|
|
|
|
|
|
|
|
|
|
| 135 |
}
|
| 136 |
}
|
| 137 |
|
|
|
|
|
|
|
| 138 |
def safe_generate_caption_direct(image, system_prompt, user_prompt, max_chars=1200):
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
| 140 |
try:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
if not system_prompt.strip() or not user_prompt.strip():
|
| 145 |
-
return "β Both system and user prompts are required"
|
| 146 |
-
|
| 147 |
-
# Aggressive cache clearing to prevent cached responses
|
| 148 |
-
torch.cuda.empty_cache()
|
| 149 |
-
if hasattr(torch.cuda, 'ipc_collect'):
|
| 150 |
-
torch.cuda.ipc_collect()
|
| 151 |
-
gc.collect()
|
| 152 |
-
|
| 153 |
-
# Handle both filepath and PIL Image
|
| 154 |
-
if isinstance(image, str):
|
| 155 |
-
# It's a filepath, load the image
|
| 156 |
-
from PIL import Image
|
| 157 |
-
pil_image = Image.open(image)
|
| 158 |
-
else:
|
| 159 |
-
# It's already a PIL Image
|
| 160 |
-
pil_image = image
|
| 161 |
-
|
| 162 |
-
# Add slight variation to prevent identical caching
|
| 163 |
-
import random
|
| 164 |
-
random_seed = random.randint(1, 10000)
|
| 165 |
-
torch.manual_seed(random_seed)
|
| 166 |
-
if torch.cuda.is_available():
|
| 167 |
-
torch.cuda.manual_seed(random_seed)
|
| 168 |
-
torch.cuda.manual_seed_all(random_seed)
|
| 169 |
-
|
| 170 |
convo = [
|
| 171 |
{"role": "system", "content": system_prompt.strip()},
|
| 172 |
-
{"role": "user",
|
| 173 |
]
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
return "β No output generated"
|
| 201 |
-
|
| 202 |
-
if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
|
| 203 |
-
input_length = inputs['input_ids'].shape[1]
|
| 204 |
-
if len(output[0]) > input_length:
|
| 205 |
-
generate_ids = output[0][input_length:]
|
| 206 |
-
result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 207 |
-
else:
|
| 208 |
-
result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 209 |
-
else:
|
| 210 |
-
result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 211 |
-
|
| 212 |
-
result = result.strip()
|
| 213 |
-
|
| 214 |
del inputs, output
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
final_result = postprocess_caption(result, max_chars=max_chars)
|
| 219 |
-
return final_result if final_result else "β Empty result"
|
| 220 |
-
|
| 221 |
except Exception as e:
|
| 222 |
-
|
| 223 |
-
gc.collect()
|
| 224 |
return f"β Error: {str(e)[:200]}"
|
| 225 |
|
| 226 |
-
|
|
|
|
|
|
|
| 227 |
@torch.no_grad()
|
| 228 |
def generate_caption(image, system, user):
|
| 229 |
if not image:
|
| 230 |
return "β Upload image first"
|
| 231 |
return safe_generate_caption_direct(image, system, user)
|
| 232 |
|
| 233 |
-
#
|
| 234 |
-
@spaces.GPU(duration=40)
|
| 235 |
@torch.no_grad()
|
| 236 |
def answer_question(image, question):
|
| 237 |
if not image:
|
|
@@ -239,289 +197,229 @@ def answer_question(image, question):
|
|
| 239 |
if not question.strip():
|
| 240 |
return "β Please ask a question"
|
| 241 |
try:
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
# Handle both filepath and PIL Image
|
| 246 |
-
if isinstance(image, str):
|
| 247 |
-
from PIL import Image
|
| 248 |
-
pil_image = Image.open(image)
|
| 249 |
-
else:
|
| 250 |
-
pil_image = image
|
| 251 |
-
|
| 252 |
convo = [
|
| 253 |
-
{"role": "system", "content": "You are a helpful image
|
| 254 |
-
{"role": "user",
|
| 255 |
]
|
| 256 |
-
|
| 257 |
-
inputs = processor(text=[
|
| 258 |
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
| 263 |
del inputs, output
|
| 264 |
-
|
| 265 |
-
gc.collect()
|
| 266 |
return postprocess_caption(result, max_chars=500) or "β No answer generated"
|
| 267 |
except Exception as e:
|
| 268 |
-
|
| 269 |
-
gc.collect()
|
| 270 |
return f"β Q&A Error: {str(e)[:200]}"
|
| 271 |
|
| 272 |
-
#
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
return (current_text.rstrip() + " " + formatted).strip()
|
| 280 |
|
| 281 |
def create_template_functions():
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
def insert_not(s, u, c):
|
| 292 |
-
t = "Do NOT mention: {content}."
|
| 293 |
-
return s, insert_template(u, t, c)
|
| 294 |
-
return insert_key, insert_que, insert_use, insert_not
|
| 295 |
-
|
| 296 |
-
# ===== EXPORT =====
|
| 297 |
-
def export_joycaption_data(tags, mention, avoid, ask, c1, c2, c3, qa, img):
|
| 298 |
try:
|
| 299 |
data = {
|
| 300 |
-
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
| 301 |
-
"source": "JoyCaption Advanced Prompting System v6.
|
| 302 |
"data": {}
|
| 303 |
}
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
if
|
| 308 |
-
if
|
| 309 |
-
|
| 310 |
-
if ask and ask.strip(): add["ask"] = ask.strip()
|
| 311 |
-
|
| 312 |
-
# Image handling - now using filepath from Gradio
|
| 313 |
if img:
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
if
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
if ask and ask.strip() and qa and qa.strip():
|
| 333 |
-
add["qa"] = {
|
| 334 |
-
"question": ask.strip(),
|
| 335 |
-
"answer": qa.strip()
|
| 336 |
-
}
|
| 337 |
-
elif ask and ask.strip():
|
| 338 |
-
add["qa"] = {
|
| 339 |
-
"question": ask.strip()
|
| 340 |
-
}
|
| 341 |
-
elif qa and qa.strip():
|
| 342 |
-
add["qa"] = {
|
| 343 |
-
"answer": qa.strip()
|
| 344 |
-
}
|
| 345 |
-
|
| 346 |
-
# Descriptions grouped together
|
| 347 |
-
descriptions = {}
|
| 348 |
-
if c1 and c1.strip(): descriptions["casual"] = c1.strip()
|
| 349 |
-
if c2 and c2.strip(): descriptions["friendly"] = c2.strip()
|
| 350 |
-
if c3 and c3.strip(): descriptions["erotic"] = c3.strip()
|
| 351 |
-
|
| 352 |
-
if descriptions:
|
| 353 |
-
add["descriptions"] = descriptions
|
| 354 |
-
|
| 355 |
-
if not add:
|
| 356 |
return "β No data to export", None
|
| 357 |
-
|
| 358 |
js = json.dumps(data, indent=2, ensure_ascii=False)
|
| 359 |
fn = f"joycaption_{time.strftime('%Y%m%d_%H%M%S')}.json"
|
| 360 |
-
|
| 361 |
-
|
|
|
|
|
|
|
| 362 |
except Exception as e:
|
| 363 |
return f"β Export failed: {str(e)}", None
|
| 364 |
|
| 365 |
-
#
|
|
|
|
| 366 |
with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Soft()) as demo:
|
| 367 |
gr.HTML("<style>textarea{resize:none!important;}</style>")
|
| 368 |
-
gr.HTML("<h1 style='text-align:center;margin-top:10px;'>
|
|
|
|
| 369 |
|
| 370 |
-
|
| 371 |
|
| 372 |
with gr.Row():
|
|
|
|
| 373 |
with gr.Column(scale=1):
|
| 374 |
-
image_input = gr.Image(
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
|
|
|
| 382 |
with gr.Column(scale=1):
|
| 383 |
-
with gr.Tab("π Casual")
|
| 384 |
gr.Markdown("**System Prompt**")
|
| 385 |
-
system1 = gr.Textbox(show_label=False,
|
|
|
|
| 386 |
gr.Markdown("**User Prompt**")
|
| 387 |
-
user1 = gr.Textbox(show_label=False,
|
|
|
|
| 388 |
gr.Markdown("**Insert Template**")
|
| 389 |
with gr.Row():
|
| 390 |
-
key_btn
|
| 391 |
-
use_btn
|
| 392 |
-
not_btn
|
| 393 |
-
que_btn
|
| 394 |
-
gen1_btn = gr.Button("Generate Casual", variant="primary"
|
|
|
|
| 395 |
gr.Markdown("**Caption:**")
|
| 396 |
-
out1 = gr.Textbox(show_label=False, lines=5, show_copy_button=True
|
|
|
|
| 397 |
|
| 398 |
-
with gr.Tab("π€ Friendly")
|
| 399 |
gr.Markdown("**System Prompt**")
|
| 400 |
-
system2 = gr.Textbox(show_label=False,
|
|
|
|
| 401 |
gr.Markdown("**User Prompt**")
|
| 402 |
-
user2 = gr.Textbox(show_label=False,
|
|
|
|
| 403 |
gr.Markdown("**Insert Template**")
|
| 404 |
with gr.Row():
|
| 405 |
-
key2_btn = gr.Button("Tags",
|
| 406 |
use2_btn = gr.Button("Mention", size="sm")
|
| 407 |
-
not2_btn = gr.Button("Avoid",
|
| 408 |
-
que2_btn = gr.Button("Ask",
|
| 409 |
-
gen2_btn = gr.Button("Generate Friendly", variant="primary"
|
|
|
|
| 410 |
gr.Markdown("**Caption:**")
|
| 411 |
-
out2 = gr.Textbox(show_label=False, lines=5, show_copy_button=True
|
|
|
|
| 412 |
|
| 413 |
-
with gr.Tab("π₯ Erotic")
|
| 414 |
gr.Markdown("**System Prompt**")
|
| 415 |
-
system3 = gr.Textbox(show_label=False,
|
|
|
|
| 416 |
gr.Markdown("**User Prompt**")
|
| 417 |
-
user3 = gr.Textbox(show_label=False,
|
|
|
|
| 418 |
gr.Markdown("**Insert Template**")
|
| 419 |
with gr.Row():
|
| 420 |
-
key3_btn = gr.Button("Tags",
|
| 421 |
use3_btn = gr.Button("Mention", size="sm")
|
| 422 |
-
not3_btn = gr.Button("Avoid",
|
| 423 |
-
que3_btn = gr.Button("Ask",
|
| 424 |
-
gen3_btn = gr.Button("Generate Erotic", variant="primary"
|
|
|
|
| 425 |
gr.Markdown("**Caption:**")
|
| 426 |
-
out3 = gr.Textbox(show_label=False, lines=5, show_copy_button=True
|
|
|
|
| 427 |
|
| 428 |
gr.Markdown("---")
|
| 429 |
-
export_btn
|
| 430 |
-
|
| 431 |
export_file = gr.File(visible=False)
|
| 432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
-
# Caption generation
|
| 435 |
gen1_btn.click(generate_caption, [image_input, system1, user1], out1)
|
| 436 |
gen2_btn.click(generate_caption, [image_input, system2, user2], out2)
|
| 437 |
gen3_btn.click(generate_caption, [image_input, system3, user3], out3)
|
| 438 |
ask_btn.click(answer_question, [image_input, question_input], qa_output)
|
| 439 |
|
| 440 |
-
# Template insertion
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
if
|
| 462 |
-
return fn(s2, u2, content)
|
| 463 |
-
return s2, u2
|
| 464 |
-
|
| 465 |
-
def insert_template_tab3(btn_type, s3, u3, keywords, custom, question, avoid):
|
| 466 |
-
key_f, que_f, use_f, not_f = create_template_functions()
|
| 467 |
-
content_map = {"key": keywords, "que": question, "use": custom, "not": avoid}
|
| 468 |
-
content = content_map.get(btn_type, "")
|
| 469 |
-
if not content.strip():
|
| 470 |
-
return s3, u3
|
| 471 |
-
fn_map = {"key": key_f, "que": que_f, "use": use_f, "not": not_f}
|
| 472 |
-
fn = fn_map.get(btn_type)
|
| 473 |
-
if fn:
|
| 474 |
-
return fn(s3, u3, content)
|
| 475 |
-
return s3, u3
|
| 476 |
-
|
| 477 |
-
# Connect template buttons for each tab
|
| 478 |
-
# Tab 1 (Casual) buttons
|
| 479 |
-
key_btn.click(lambda s1, u1, k, c, q, a: insert_template_tab1("key", s1, u1, k, c, q, a),
|
| 480 |
-
[system1, user1, keywords_input, custom_instruction_input, question_input, avoid_input], [system1, user1])
|
| 481 |
-
que_btn.click(lambda s1, u1, k, c, q, a: insert_template_tab1("que", s1, u1, k, c, q, a),
|
| 482 |
-
[system1, user1, keywords_input, custom_instruction_input, question_input, avoid_input], [system1, user1])
|
| 483 |
-
use_btn.click(lambda s1, u1, k, c, q, a: insert_template_tab1("use", s1, u1, k, c, q, a),
|
| 484 |
-
[system1, user1, keywords_input, custom_instruction_input, question_input, avoid_input], [system1, user1])
|
| 485 |
-
not_btn.click(lambda s1, u1, k, c, q, a: insert_template_tab1("not", s1, u1, k, c, q, a),
|
| 486 |
-
[system1, user1, keywords_input, custom_instruction_input, question_input, avoid_input], [system1, user1])
|
| 487 |
-
|
| 488 |
-
# Tab 2 (Friendly) buttons
|
| 489 |
-
key2_btn.click(lambda s2, u2, k, c, q, a: insert_template_tab2("key", s2, u2, k, c, q, a),
|
| 490 |
-
[system2, user2, keywords_input, custom_instruction_input, question_input, avoid_input], [system2, user2])
|
| 491 |
-
que2_btn.click(lambda s2, u2, k, c, q, a: insert_template_tab2("que", s2, u2, k, c, q, a),
|
| 492 |
-
[system2, user2, keywords_input, custom_instruction_input, question_input, avoid_input], [system2, user2])
|
| 493 |
-
use2_btn.click(lambda s2, u2, k, c, q, a: insert_template_tab2("use", s2, u2, k, c, q, a),
|
| 494 |
-
[system2, user2, keywords_input, custom_instruction_input, question_input, avoid_input], [system2, user2])
|
| 495 |
-
not2_btn.click(lambda s2, u2, k, c, q, a: insert_template_tab2("not", s2, u2, k, c, q, a),
|
| 496 |
-
[system2, user2, keywords_input, custom_instruction_input, question_input, avoid_input], [system2, user2])
|
| 497 |
-
|
| 498 |
-
# Tab 3 (Erotic) buttons
|
| 499 |
-
key3_btn.click(lambda s3, u3, k, c, q, a: insert_template_tab3("key", s3, u3, k, c, q, a),
|
| 500 |
-
[system3, user3, keywords_input, custom_instruction_input, question_input, avoid_input], [system3, user3])
|
| 501 |
-
que3_btn.click(lambda s3, u3, k, c, q, a: insert_template_tab3("que", s3, u3, k, c, q, a),
|
| 502 |
-
[system3, user3, keywords_input, custom_instruction_input, question_input, avoid_input], [system3, user3])
|
| 503 |
-
use3_btn.click(lambda s3, u3, k, c, q, a: insert_template_tab3("use", s3, u3, k, c, q, a),
|
| 504 |
-
[system3, user3, keywords_input, custom_instruction_input, question_input, avoid_input], [system3, user3])
|
| 505 |
-
not3_btn.click(lambda s3, u3, k, c, q, a: insert_template_tab3("not", s3, u3, k, c, q, a),
|
| 506 |
-
[system3, user3, keywords_input, custom_instruction_input, question_input, avoid_input], [system3, user3])
|
| 507 |
-
|
| 508 |
-
# Export functionality
|
| 509 |
-
def handle_export(k, c, a, q, c1, c2, c3, qa, img):
|
| 510 |
-
msg, fd = export_joycaption_data(k, c, a, q, c1, c2, c3, qa, img)
|
| 511 |
-
if fd:
|
| 512 |
-
js, fn = fd
|
| 513 |
-
path = os.path.join(tempfile.gettempdir(), fn)
|
| 514 |
-
with open(path, "w", encoding="utf-8") as f:
|
| 515 |
-
f.write(js)
|
| 516 |
return gr.update(value=msg, visible=True), gr.update(value=path, visible=True)
|
| 517 |
return gr.update(value=msg, visible=True), gr.update(visible=False)
|
| 518 |
|
| 519 |
export_btn.click(
|
| 520 |
-
|
| 521 |
-
[keywords_input,
|
| 522 |
out1, out2, out3, qa_output, image_input],
|
| 523 |
-
[
|
| 524 |
)
|
| 525 |
|
| 526 |
if __name__ == "__main__":
|
| 527 |
-
demo.launch()
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
JoyCaption Advanced Prompting System v6.1
|
| 3 |
+
Optimizations over v6.0:
|
| 4 |
+
- Removed use_cache=False β KV-cache re-enabled, ~20-25% faster generation
|
| 5 |
+
- Removed random seed injection β no longer conflicts with KV-cache reuse
|
| 6 |
+
- Consolidated 3Γ redundant CUDA cache clears β 1 post-generation clear
|
| 7 |
+
- GPU duration: 60β30 for generate_caption, 40β20 for answer_question
|
| 8 |
+
(real wall-time on H200 is 12-25s; shorter ceiling improves queue priority)
|
| 9 |
+
- Shortened system/user prompts by ~40% (redundant qualifiers removed)
|
| 10 |
+
- Stable elem_id on every interactive component (selectors won't break on layout changes)
|
| 11 |
+
- image_input.change() clears the three caption outputs (fixes "Error" state persistence)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
try:
|
| 15 |
import spaces
|
| 16 |
if not hasattr(spaces, 'GPU'):
|
| 17 |
+
def _gpu(*a, **kw):
|
| 18 |
+
def _w(f): return f
|
| 19 |
+
return _w
|
| 20 |
+
spaces.GPU = _gpu
|
| 21 |
except Exception:
|
| 22 |
import types
|
| 23 |
spaces = types.SimpleNamespace()
|
| 24 |
+
def _gpu(*a, **kw):
|
| 25 |
+
def _w(f): return f
|
| 26 |
+
return _w
|
| 27 |
+
spaces.GPU = _gpu
|
| 28 |
|
| 29 |
import gradio as gr
|
| 30 |
import torch
|
| 31 |
from transformers import LlavaForConditionalGeneration, AutoProcessor
|
| 32 |
+
import tempfile, gc, os, json, time, re
|
| 33 |
from urllib.parse import urlparse
|
| 34 |
from typing import Optional
|
| 35 |
|
| 36 |
+
# ββ Utilities ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 37 |
+
|
| 38 |
+
def fix_image_url(raw: str, host: Optional[str] = None) -> str:
|
| 39 |
+
if not raw:
|
| 40 |
+
return raw
|
|
|
|
| 41 |
try:
|
| 42 |
+
p = urlparse(raw)
|
| 43 |
except Exception:
|
| 44 |
+
p = None
|
| 45 |
+
if p and p.scheme and p.netloc:
|
| 46 |
+
full = raw
|
|
|
|
|
|
|
|
|
|
| 47 |
if "/file=" in full and "/gradio_api/file=" not in full:
|
| 48 |
full = full.replace("/file=", "/gradio_api/file=")
|
|
|
|
|
|
|
| 49 |
return full
|
| 50 |
+
if raw.startswith("/tmp/") or "temp" in raw.lower():
|
|
|
|
|
|
|
|
|
|
| 51 |
if not host:
|
| 52 |
host = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST")
|
|
|
|
| 53 |
if host:
|
| 54 |
host = host.rstrip("/")
|
| 55 |
+
if not host.startswith("http"):
|
| 56 |
host = "https://" + host
|
| 57 |
+
return f"{host}/gradio_api/file=/{raw.lstrip('/')}"
|
| 58 |
+
return raw
|
| 59 |
+
|
| 60 |
+
def postprocess_caption(text: str, max_chars: int = 1200) -> str:
|
| 61 |
+
if not text:
|
| 62 |
+
return ""
|
| 63 |
+
result = re.sub(r'^(a photo of|an image of|a picture of|this (is a photo|shows))\s*',
|
| 64 |
+
'', text.strip(), flags=re.IGNORECASE)
|
| 65 |
+
if len(result) > max_chars:
|
| 66 |
+
cut = max_chars
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
for i in range(len(result) - 1, max(0, max_chars - 100), -1):
|
| 68 |
if result[i] in '.!?':
|
| 69 |
+
cut = i + 1
|
| 70 |
break
|
| 71 |
+
result = result[:cut].strip()
|
| 72 |
+
if result and result[-1] not in '.!?':
|
| 73 |
+
result += '.'
|
| 74 |
return result
|
| 75 |
|
| 76 |
+
def _cleanup():
|
| 77 |
try:
|
| 78 |
if torch.cuda.is_available():
|
| 79 |
torch.cuda.empty_cache()
|
|
|
|
| 80 |
gc.collect()
|
| 81 |
except Exception:
|
| 82 |
pass
|
| 83 |
|
| 84 |
+
_cleanup()
|
| 85 |
|
| 86 |
+
# ββ Model setup ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 87 |
+
|
| 88 |
+
_tmp = tempfile.gettempdir()
|
| 89 |
+
for k, v in {"HF_HOME": "hf_cache", "TRANSFORMERS_CACHE": "transformers_cache",
|
| 90 |
+
"HF_DATASETS_CACHE": "datasets_cache", "TORCH_HOME": "torch_cache"}.items():
|
| 91 |
+
os.environ[k] = os.path.join(_tmp, v)
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
|
| 94 |
SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
|
| 95 |
|
| 96 |
+
print("π Loading JoyCaption v6.1...")
|
| 97 |
processor = AutoProcessor.from_pretrained(MODEL_PATH, cache_dir=None)
|
| 98 |
model = LlavaForConditionalGeneration.from_pretrained(
|
| 99 |
MODEL_PATH,
|
| 100 |
torch_dtype=torch.bfloat16,
|
| 101 |
device_map="auto",
|
| 102 |
cache_dir=None,
|
| 103 |
+
low_cpu_mem_usage=True,
|
| 104 |
)
|
| 105 |
model.eval()
|
| 106 |
+
print("β
Model loaded.")
|
| 107 |
+
|
| 108 |
+
# ββ Prompts (shortened ~40%) ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
|
|
|
|
| 110 |
DEFAULT_PROMPTS = {
|
| 111 |
"casual": {
|
| 112 |
"system": "You are a descriptive image captioner focused on settings and female subjects.",
|
| 113 |
+
"user": (
|
| 114 |
+
"Describe the setting and female subjects: clothing or nudity, body, actions, poses. "
|
| 115 |
+
"Be thorough on female details. Skip male clothing, logos, watermarks."
|
| 116 |
+
)
|
| 117 |
},
|
| 118 |
"friendly": {
|
| 119 |
+
"system": "You are an enthusiastic observer describing attractive women.",
|
| 120 |
+
"user": (
|
| 121 |
+
"Describe the women's appearance, nudity, body, and actions with enthusiasm. "
|
| 122 |
+
"Be thorough on female exposure and poses. Skip male clothing, logos, watermarks."
|
| 123 |
+
)
|
| 124 |
},
|
| 125 |
"erotic": {
|
| 126 |
+
"system": "You are an adult content specialist identifying erotic elements.",
|
| 127 |
+
"user": (
|
| 128 |
+
"Describe the women's nudity and erotic elements, then explain why this is sexually appealing. "
|
| 129 |
+
"Focus on female exposure. Skip male clothing, logos, watermarks."
|
| 130 |
+
)
|
| 131 |
}
|
| 132 |
}
|
| 133 |
|
| 134 |
+
# ββ Generation core ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 135 |
+
|
| 136 |
def safe_generate_caption_direct(image, system_prompt, user_prompt, max_chars=1200):
|
| 137 |
+
if image is None:
|
| 138 |
+
return "β No image provided"
|
| 139 |
+
if not system_prompt.strip() or not user_prompt.strip():
|
| 140 |
+
return "β Both system and user prompts are required"
|
| 141 |
try:
|
| 142 |
+
from PIL import Image as PILImage
|
| 143 |
+
pil_image = PILImage.open(image) if isinstance(image, str) else image
|
| 144 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
convo = [
|
| 146 |
{"role": "system", "content": system_prompt.strip()},
|
| 147 |
+
{"role": "user", "content": user_prompt.strip()},
|
| 148 |
]
|
| 149 |
+
convo_str = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
|
| 150 |
+
inputs = processor(text=[convo_str], images=[pil_image], return_tensors="pt").to("cuda")
|
| 151 |
+
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
| 152 |
+
|
| 153 |
+
# use_cache left at default True β KV-cache speeds up autoregressive decoding
|
| 154 |
+
# No manual seed β seeds conflict with KV-cache reuse and provide no real benefit
|
| 155 |
+
output = model.generate(
|
| 156 |
+
**inputs,
|
| 157 |
+
max_new_tokens=600,
|
| 158 |
+
do_sample=True,
|
| 159 |
+
temperature=0.8,
|
| 160 |
+
top_p=0.85,
|
| 161 |
+
top_k=50,
|
| 162 |
+
repetition_penalty=1.1,
|
| 163 |
+
no_repeat_ngram_size=3,
|
| 164 |
+
pad_token_id=processor.tokenizer.eos_token_id,
|
| 165 |
+
eos_token_id=processor.tokenizer.eos_token_id,
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
input_len = inputs["input_ids"].shape[1]
|
| 169 |
+
result = processor.tokenizer.decode(
|
| 170 |
+
output[0][input_len:], skip_special_tokens=True,
|
| 171 |
+
clean_up_tokenization_spaces=False
|
| 172 |
+
).strip()
|
| 173 |
+
|
| 174 |
+
# Single cleanup after generation (removed two redundant mid-function clears)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
del inputs, output
|
| 176 |
+
_cleanup()
|
| 177 |
+
|
| 178 |
+
return postprocess_caption(result, max_chars) or "β Empty result"
|
|
|
|
|
|
|
|
|
|
| 179 |
except Exception as e:
|
| 180 |
+
_cleanup()
|
|
|
|
| 181 |
return f"β Error: {str(e)[:200]}"
|
| 182 |
|
| 183 |
+
# ββ GPU-decorated entry points ββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββ
|
| 184 |
+
|
| 185 |
+
@spaces.GPU(duration=30) # was 60; real wall-time on H200 β 12β25s
|
| 186 |
@torch.no_grad()
|
| 187 |
def generate_caption(image, system, user):
|
| 188 |
if not image:
|
| 189 |
return "β Upload image first"
|
| 190 |
return safe_generate_caption_direct(image, system, user)
|
| 191 |
|
| 192 |
+
@spaces.GPU(duration=20) # was 40; Q&A is shorter (max_new_tokens=300)
|
|
|
|
| 193 |
@torch.no_grad()
|
| 194 |
def answer_question(image, question):
|
| 195 |
if not image:
|
|
|
|
| 197 |
if not question.strip():
|
| 198 |
return "β Please ask a question"
|
| 199 |
try:
|
| 200 |
+
from PIL import Image as PILImage
|
| 201 |
+
pil_image = PILImage.open(image) if isinstance(image, str) else image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
convo = [
|
| 203 |
+
{"role": "system", "content": "You are a helpful image analyst."},
|
| 204 |
+
{"role": "user", "content": question.strip()},
|
| 205 |
]
|
| 206 |
+
convo_str = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
|
| 207 |
+
inputs = processor(text=[convo_str], images=[pil_image], return_tensors="pt").to("cuda")
|
| 208 |
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
| 209 |
+
output = model.generate(**inputs, max_new_tokens=300, do_sample=True,
|
| 210 |
+
temperature=0.6, top_p=0.9,
|
| 211 |
+
pad_token_id=processor.tokenizer.eos_token_id,
|
| 212 |
+
eos_token_id=processor.tokenizer.eos_token_id)
|
| 213 |
+
result = processor.tokenizer.decode(
|
| 214 |
+
output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
| 215 |
del inputs, output
|
| 216 |
+
_cleanup()
|
|
|
|
| 217 |
return postprocess_caption(result, max_chars=500) or "β No answer generated"
|
| 218 |
except Exception as e:
|
| 219 |
+
_cleanup()
|
|
|
|
| 220 |
return f"β Q&A Error: {str(e)[:200]}"
|
| 221 |
|
| 222 |
+
# ββ Template helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 223 |
+
|
| 224 |
+
def _ins(text, tpl, content):
|
| 225 |
+
formatted = tpl.format(content=content.strip())
|
| 226 |
+
if not content.strip() or formatted in text:
|
| 227 |
+
return text
|
| 228 |
+
return (text.rstrip() + " " + formatted).strip()
|
|
|
|
| 229 |
|
| 230 |
def create_template_functions():
|
| 231 |
+
key_f = lambda s, u, c: (s, _ins(u, "Pay attention to these keywords: {content}.", c))
|
| 232 |
+
que_f = lambda s, u, c: (s, _ins(u, "Answer this question: {content}.", c))
|
| 233 |
+
use_f = lambda s, u, c: (s, _ins(u, "Make sure that you mention: {content}.", c))
|
| 234 |
+
not_f = lambda s, u, c: (s, _ins(u, "Do NOT mention: {content}.", c))
|
| 235 |
+
return key_f, que_f, use_f, not_f
|
| 236 |
+
|
| 237 |
+
# ββ Export ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 238 |
+
|
| 239 |
+
def export_joycaption_data(tags, mention, avoid, ask, c1, c2, c3, qa_ans, img):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
try:
|
| 241 |
data = {
|
| 242 |
+
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
| 243 |
+
"source": "JoyCaption Advanced Prompting System v6.1",
|
| 244 |
"data": {}
|
| 245 |
}
|
| 246 |
+
d = data["data"]
|
| 247 |
+
if tags and tags.strip(): d["tags"] = tags.strip()
|
| 248 |
+
if mention and mention.strip(): d["mention"] = mention.strip()
|
| 249 |
+
if avoid and avoid.strip(): d["avoid"] = avoid.strip()
|
| 250 |
+
if ask and ask.strip(): d["ask"] = ask.strip()
|
| 251 |
+
|
|
|
|
|
|
|
|
|
|
| 252 |
if img:
|
| 253 |
+
if isinstance(img, str) and os.path.exists(img):
|
| 254 |
+
url = fix_image_url(img, host=(SPACE_HOST or ""))
|
| 255 |
+
d["image_path"] = url if url != img else img
|
| 256 |
+
else:
|
| 257 |
+
d["image_error"] = f"Invalid path: {type(img).__name__}"
|
| 258 |
+
|
| 259 |
+
qa_obj = {}
|
| 260 |
+
if ask and ask.strip(): qa_obj["question"] = ask.strip()
|
| 261 |
+
if qa_ans and qa_ans.strip(): qa_obj["answer"] = qa_ans.strip()
|
| 262 |
+
if qa_obj: d["qa"] = qa_obj
|
| 263 |
+
|
| 264 |
+
descs = {}
|
| 265 |
+
if c1 and c1.strip(): descs["casual"] = c1.strip()
|
| 266 |
+
if c2 and c2.strip(): descs["friendly"] = c2.strip()
|
| 267 |
+
if c3 and c3.strip(): descs["erotic"] = c3.strip()
|
| 268 |
+
if descs: d["descriptions"] = descs
|
| 269 |
+
|
| 270 |
+
if not d:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
return "β No data to export", None
|
| 272 |
+
|
| 273 |
js = json.dumps(data, indent=2, ensure_ascii=False)
|
| 274 |
fn = f"joycaption_{time.strftime('%Y%m%d_%H%M%S')}.json"
|
| 275 |
+
path = os.path.join(tempfile.gettempdir(), fn)
|
| 276 |
+
with open(path, "w", encoding="utf-8") as f:
|
| 277 |
+
f.write(js)
|
| 278 |
+
return f"β
Exported {len(d)} fields", path
|
| 279 |
except Exception as e:
|
| 280 |
return f"β Export failed: {str(e)}", None
|
| 281 |
|
| 282 |
+
# ββ UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 283 |
+
|
| 284 |
with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Soft()) as demo:
|
| 285 |
gr.HTML("<style>textarea{resize:none!important;}</style>")
|
| 286 |
+
gr.HTML("<h1 style='text-align:center;margin-top:10px;'>"
|
| 287 |
+
"π¨ JoyCaption Advanced Prompting System (v6.1)</h1><hr>")
|
| 288 |
|
| 289 |
+
key_f, que_f, use_f, not_f = create_template_functions()
|
| 290 |
|
| 291 |
with gr.Row():
|
| 292 |
+
# ββ Left column: inputs ββββββββββββββββββββββββββββββββββββββββββ
|
| 293 |
with gr.Column(scale=1):
|
| 294 |
+
image_input = gr.Image(
|
| 295 |
+
type="filepath", label="πΈ Image",
|
| 296 |
+
elem_id="joy_image_input"
|
| 297 |
+
)
|
| 298 |
+
keywords_input = gr.Textbox(label="π·οΈ Tags", lines=2,
|
| 299 |
+
placeholder="e.g. beach, sunset",
|
| 300 |
+
elem_id="joy_tags_input")
|
| 301 |
+
custom_inst_input = gr.Textbox(label="π― Mention", lines=2,
|
| 302 |
+
placeholder="Extra instructions",
|
| 303 |
+
elem_id="joy_mention_input")
|
| 304 |
+
avoid_input = gr.Textbox(label="π« Avoid", lines=2,
|
| 305 |
+
placeholder="Things to avoid",
|
| 306 |
+
elem_id="joy_avoid_input")
|
| 307 |
+
question_input = gr.Textbox(label="β Ask", lines=2,
|
| 308 |
+
placeholder="Ask about image",
|
| 309 |
+
elem_id="joy_ask_input")
|
| 310 |
+
ask_btn = gr.Button("Ask", variant="secondary", elem_id="joy_ask_btn")
|
| 311 |
+
qa_output = gr.Textbox(label="Answer", lines=3, show_copy_button=True,
|
| 312 |
+
elem_id="joy_output_qa")
|
| 313 |
|
| 314 |
+
# ββ Right column: tabs βββββββββββββββββββββββββββββββββββββββββββ
|
| 315 |
with gr.Column(scale=1):
|
| 316 |
+
with gr.Tab("π Casual"):
|
| 317 |
gr.Markdown("**System Prompt**")
|
| 318 |
+
system1 = gr.Textbox(show_label=False,
|
| 319 |
+
value=DEFAULT_PROMPTS["casual"]["system"], lines=3)
|
| 320 |
gr.Markdown("**User Prompt**")
|
| 321 |
+
user1 = gr.Textbox(show_label=False,
|
| 322 |
+
value=DEFAULT_PROMPTS["casual"]["user"], lines=3)
|
| 323 |
gr.Markdown("**Insert Template**")
|
| 324 |
with gr.Row():
|
| 325 |
+
key_btn = gr.Button("Tags", size="sm")
|
| 326 |
+
use_btn = gr.Button("Mention", size="sm")
|
| 327 |
+
not_btn = gr.Button("Avoid", size="sm")
|
| 328 |
+
que_btn = gr.Button("Ask", size="sm")
|
| 329 |
+
gen1_btn = gr.Button("Generate Casual", variant="primary",
|
| 330 |
+
elem_id="joy_btn_casual")
|
| 331 |
gr.Markdown("**Caption:**")
|
| 332 |
+
out1 = gr.Textbox(show_label=False, lines=5, show_copy_button=True,
|
| 333 |
+
elem_id="joy_output_casual")
|
| 334 |
|
| 335 |
+
with gr.Tab("π€ Friendly"):
|
| 336 |
gr.Markdown("**System Prompt**")
|
| 337 |
+
system2 = gr.Textbox(show_label=False,
|
| 338 |
+
value=DEFAULT_PROMPTS["friendly"]["system"], lines=3)
|
| 339 |
gr.Markdown("**User Prompt**")
|
| 340 |
+
user2 = gr.Textbox(show_label=False,
|
| 341 |
+
value=DEFAULT_PROMPTS["friendly"]["user"], lines=3)
|
| 342 |
gr.Markdown("**Insert Template**")
|
| 343 |
with gr.Row():
|
| 344 |
+
key2_btn = gr.Button("Tags", size="sm")
|
| 345 |
use2_btn = gr.Button("Mention", size="sm")
|
| 346 |
+
not2_btn = gr.Button("Avoid", size="sm")
|
| 347 |
+
que2_btn = gr.Button("Ask", size="sm")
|
| 348 |
+
gen2_btn = gr.Button("Generate Friendly", variant="primary",
|
| 349 |
+
elem_id="joy_btn_friendly")
|
| 350 |
gr.Markdown("**Caption:**")
|
| 351 |
+
out2 = gr.Textbox(show_label=False, lines=5, show_copy_button=True,
|
| 352 |
+
elem_id="joy_output_friendly")
|
| 353 |
|
| 354 |
+
with gr.Tab("π₯ Erotic"):
|
| 355 |
gr.Markdown("**System Prompt**")
|
| 356 |
+
system3 = gr.Textbox(show_label=False,
|
| 357 |
+
value=DEFAULT_PROMPTS["erotic"]["system"], lines=3)
|
| 358 |
gr.Markdown("**User Prompt**")
|
| 359 |
+
user3 = gr.Textbox(show_label=False,
|
| 360 |
+
value=DEFAULT_PROMPTS["erotic"]["user"], lines=3)
|
| 361 |
gr.Markdown("**Insert Template**")
|
| 362 |
with gr.Row():
|
| 363 |
+
key3_btn = gr.Button("Tags", size="sm")
|
| 364 |
use3_btn = gr.Button("Mention", size="sm")
|
| 365 |
+
not3_btn = gr.Button("Avoid", size="sm")
|
| 366 |
+
que3_btn = gr.Button("Ask", size="sm")
|
| 367 |
+
gen3_btn = gr.Button("Generate Erotic", variant="primary",
|
| 368 |
+
elem_id="joy_btn_erotic")
|
| 369 |
gr.Markdown("**Caption:**")
|
| 370 |
+
out3 = gr.Textbox(show_label=False, lines=5, show_copy_button=True,
|
| 371 |
+
elem_id="joy_output_erotic")
|
| 372 |
|
| 373 |
gr.Markdown("---")
|
| 374 |
+
export_btn = gr.Button("π¦ Export JSON", variant="secondary")
|
| 375 |
+
export_msg = gr.Textbox(visible=False)
|
| 376 |
export_file = gr.File(visible=False)
|
| 377 |
|
| 378 |
+
# ββ Clear outputs when a new image is uploaded βββββββββββββββββββββββββ
|
| 379 |
+
# Runs client-side with queue=False β no GPU cost, no ZeroGPU reservation.
|
| 380 |
+
# Prevents "Error" text from a previous failed generation persisting into
|
| 381 |
+
# the next upload and confusing the user.
|
| 382 |
+
image_input.change(
|
| 383 |
+
lambda: ("", "", ""), inputs=None, outputs=[out1, out2, out3], queue=False
|
| 384 |
+
)
|
| 385 |
|
| 386 |
+
# ββ Caption generation ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 387 |
gen1_btn.click(generate_caption, [image_input, system1, user1], out1)
|
| 388 |
gen2_btn.click(generate_caption, [image_input, system2, user2], out2)
|
| 389 |
gen3_btn.click(generate_caption, [image_input, system3, user3], out3)
|
| 390 |
ask_btn.click(answer_question, [image_input, question_input], qa_output)
|
| 391 |
|
| 392 |
+
# ββ Template insertion βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 393 |
+
_common = [keywords_input, custom_inst_input, question_input, avoid_input]
|
| 394 |
+
for btn, fn_type, sys_box, usr_box in [
|
| 395 |
+
(key_btn, "key", system1, user1), (use_btn, "use", system1, user1),
|
| 396 |
+
(not_btn, "not", system1, user1), (que_btn, "que", system1, user1),
|
| 397 |
+
(key2_btn, "key", system2, user2), (use2_btn, "use", system2, user2),
|
| 398 |
+
(not2_btn, "not", system2, user2), (que2_btn, "que", system2, user2),
|
| 399 |
+
(key3_btn, "key", system3, user3), (use3_btn, "use", system3, user3),
|
| 400 |
+
(not3_btn, "not", system3, user3), (que3_btn, "que", system3, user3),
|
| 401 |
+
]:
|
| 402 |
+
_fn_map = {"key": key_f, "use": use_f, "not": not_f, "que": que_f}
|
| 403 |
+
_fn = _fn_map[fn_type]
|
| 404 |
+
_sb, _ub = sys_box, usr_box
|
| 405 |
+
btn.click(
|
| 406 |
+
lambda s, u, k, c, q, a, _f=_fn: _f(s, u, {"key": k, "que": q, "use": c, "not": a}[fn_type]),
|
| 407 |
+
[_sb, _ub] + _common, [_sb, _ub]
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
# ββ Export ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 411 |
+
def _handle_export(k, c, a, q, c1, c2, c3, qa, img):
|
| 412 |
+
msg, path = export_joycaption_data(k, c, a, q, c1, c2, c3, qa, img)
|
| 413 |
+
if path:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
return gr.update(value=msg, visible=True), gr.update(value=path, visible=True)
|
| 415 |
return gr.update(value=msg, visible=True), gr.update(visible=False)
|
| 416 |
|
| 417 |
export_btn.click(
|
| 418 |
+
_handle_export,
|
| 419 |
+
[keywords_input, custom_inst_input, avoid_input, question_input,
|
| 420 |
out1, out2, out3, qa_output, image_input],
|
| 421 |
+
[export_msg, export_file]
|
| 422 |
)
|
| 423 |
|
| 424 |
if __name__ == "__main__":
|
| 425 |
+
demo.launch()
|