nickdigger committed on
Commit
6fe2bf5
·
verified ·
1 Parent(s): ed602d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -345
app.py CHANGED
@@ -13,16 +13,14 @@ except Exception:
13
  return _wrap
14
  spaces.GPU = _spaces_gpu
15
 
16
- @spaces.GPU()
17
- def _joycaption_register_gpu():
18
- return None
19
-
20
  import gradio as gr
21
  import torch
22
- from transformers import LlavaForConditionalGeneration, AutoProcessor
23
  from PIL import Image
24
- import tempfile, gc, os, shutil, json, time, re
25
  from pathlib import Path
 
 
26
  from hf_space_utils import fix_image_url, postprocess_caption
27
 
28
  # ===== Storage cleanup setup =====
@@ -48,330 +46,190 @@ def cleanup_storage():
48
 
49
  TITLE = """
50
  <div style="text-align:center;margin:20px 0;">
51
- <h1>🎨 JoyCaption Three-Tone + Q&A (v3.3 ZeroGPU)</h1>
52
- <p><strong>✅ Blog-friendly captions • SEO optimized • Longer descriptions • ZeroGPU optimized</strong></p>
53
  </div><hr>
54
  """
55
 
56
  print("πŸš€ Loading JoyCaption model...")
57
- processor = None
58
- model = None
59
-
60
- if not os.environ.get("SKIP_MODEL_LOAD"):
61
- processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
62
- model = LlavaForConditionalGeneration.from_pretrained(
63
- MODEL_PATH,
64
- torch_dtype=torch.bfloat16,
65
- device_map="auto",
66
- low_cpu_mem_usage=True
67
- )
68
- model.eval()
69
- print("βœ… Model loaded successfully!")
70
- cleanup_storage()
71
- else:
72
- print("⚠️ SKIP_MODEL_LOAD=1 β†’ model skipped")
73
 
74
- # ===== BLOG-FRIENDLY PROMPT CONFIGURATIONS =====
75
  CAPTION_PROMPTS = {
76
- "engaging": {
77
- "system": "You are a helpful image captioner.",
78
- "prompt": "Write a descriptive caption for this image. Describe the scene, people, their appearance, clothing, poses, and setting in a natural, engaging way.",
79
- "temperature": 0.7,
80
- "top_p": 0.9,
81
- "max_tokens": 600
82
- },
83
- "casual_friend": {
84
- "system": "You are an uncensored content specialist who creates detailed, explicit descriptions.",
85
- "prompt": "Create a detailed, explicit description of this scene. Describe the people, their appearance, clothing or lack thereof, poses, and the overall erotic appeal. Use direct adult language and be thorough.",
86
- "temperature": 0.6,
87
- "top_p": 0.9,
88
- "max_tokens": 600
89
- },
90
- "uncensored_keywords": {
91
- "system": "You are a helpful image captioner.",
92
- "prompt": "Describe this image.",
93
- "temperature": 0.6,
94
- "top_p": 0.9,
95
- "max_tokens": 600
96
- }
97
  }
98
 
99
- def apply_smart_corrections(text):
100
- """Apply smart corrections for common AI vision errors"""
101
- if not text or not isinstance(text, str):
102
- return text
103
-
104
- corrections = {
105
- r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
106
- r'\btopless women\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
107
- r'\btopless woman\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
108
- r'\bwearing nothing\b': 'nude',
109
- r'\bnot wearing.*clothes\b': 'nude',
110
- r'\bcompletely naked\b': 'nude',
111
- r'\bfully nude\b': 'nude',
112
- }
113
 
114
- corrected_text = text
115
- try:
116
- for pattern, replacement in corrections.items():
117
- if callable(replacement):
118
- corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
119
- else:
120
- corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
121
- except Exception as e:
122
- print(f"Error in corrections: {e}")
123
- return text
 
 
 
 
 
 
 
124
 
125
- return corrected_text
 
 
 
 
 
 
 
 
 
 
126
 
127
- def safe_generate_caption_direct(image, tone, max_chars=1000, keywords_text="", custom_instruction=""):
128
- """Generate caption using blog-friendly approach"""
129
- try:
130
- if image is None:
131
- return f"❌ No image provided for {tone}"
132
-
133
- caption_config = CAPTION_PROMPTS.get(tone, CAPTION_PROMPTS["engaging"])
134
- base_prompt = caption_config["prompt"]
135
-
136
- # Handle keywords for casual_friend (erotic) tone
137
- if tone == "casual_friend" and keywords_text and keywords_text.strip():
138
- base_prompt += f" Pay special attention to these elements if present: {keywords_text.strip()}"
139
-
140
- # Handle custom instructions and keywords for uncensored_keywords (third) tone ONLY
141
- if tone == "uncensored_keywords":
142
- if custom_instruction and custom_instruction.strip():
143
- base_prompt += f" Make sure that you mention: {custom_instruction.strip()}"
144
- if keywords_text and keywords_text.strip():
145
- base_prompt += f" You are allowed to use these keywords: {keywords_text.strip()}"
146
-
147
- # Use conversation format
148
- convo = [
149
- {"role": "system", "content": caption_config["system"]},
150
- {"role": "user", "content": base_prompt}
151
- ]
152
-
153
- convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
154
- inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
155
-
156
- device = next(model.parameters()).device
157
- inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
158
-
159
- if 'pixel_values' in inputs:
160
- inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
161
-
162
- temperature = caption_config.get("temperature", 0.4)
163
- top_p = caption_config.get("top_p", 0.8)
164
- max_tokens = caption_config.get("max_tokens", 600)
165
-
166
- with torch.no_grad():
167
- output = model.generate(
168
- **inputs,
169
- max_new_tokens=max_tokens,
170
- do_sample=True,
171
- temperature=temperature,
172
- top_p=top_p,
173
- top_k=None,
174
- repetition_penalty=1.1,
175
- use_cache=True,
176
- pad_token_id=processor.tokenizer.eos_token_id,
177
- eos_token_id=processor.tokenizer.eos_token_id
178
- )
179
-
180
- if output is None or len(output) == 0:
181
- return f"❌ No output generated for {tone}"
182
-
183
- # Proper decoding
184
- if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
185
- input_length = inputs['input_ids'].shape[1]
186
- if len(output[0]) > input_length:
187
- generate_ids = output[0][input_length:]
188
- result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
189
- else:
190
- result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
191
- else:
192
- result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
193
-
194
- result = result.strip()
195
- result = apply_smart_corrections(result)
196
-
197
- # Cleanup after generation
198
- del inputs, output
199
- gc.collect()
200
-
201
- # Apply postprocessing
202
- final_result = postprocess_caption(result, max_chars=max_chars)
203
-
204
- return final_result if final_result else f"❌ Empty result for {tone}"
205
-
206
- except Exception as e:
207
- gc.collect()
208
- return f"❌ Error: {str(e)[:200]}"
209
 
210
- def safe_generate_custom_prompt(image, system_prompt, user_prompt, max_chars=1000):
211
- """Generate caption using custom system and user prompts for playground"""
212
- try:
213
- if image is None:
214
- return "❌ No image provided"
215
-
216
- if not system_prompt or not system_prompt.strip():
217
- return "❌ System prompt is required"
218
-
219
- if not user_prompt or not user_prompt.strip():
220
- return "❌ User prompt is required"
221
-
222
- # Use custom prompts
223
- convo = [
224
- {"role": "system", "content": system_prompt.strip()},
225
- {"role": "user", "content": user_prompt.strip()}
226
- ]
227
-
228
- convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
229
- inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
230
-
231
- device = next(model.parameters()).device
232
- inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
233
-
234
- if 'pixel_values' in inputs:
235
- inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
236
-
237
- with torch.no_grad():
238
- output = model.generate(
239
- **inputs,
240
- max_new_tokens=600,
241
- do_sample=True,
242
- temperature=0.6,
243
- top_p=0.9,
244
- top_k=None,
245
- repetition_penalty=1.1,
246
- use_cache=True,
247
- pad_token_id=processor.tokenizer.eos_token_id,
248
- eos_token_id=processor.tokenizer.eos_token_id
249
- )
250
-
251
- if output is None or len(output) == 0:
252
- return "❌ No output generated"
253
-
254
- # Proper decoding
255
- if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
256
- input_length = inputs['input_ids'].shape[1]
257
- if len(output[0]) > input_length:
258
- generate_ids = output[0][input_length:]
259
- result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
260
- else:
261
- result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
262
- else:
263
- result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
264
-
265
- result = result.strip()
266
- result = apply_smart_corrections(result)
267
-
268
- # Cleanup after generation
269
- del inputs, output
270
- gc.collect()
271
-
272
- # Apply postprocessing
273
- final_result = postprocess_caption(result, max_chars=max_chars)
274
-
275
- return final_result if final_result else "❌ Empty result"
276
 
277
- except Exception as e:
278
- gc.collect()
279
- return f"❌ Error: {str(e)[:200]}"
280
 
281
- # Individual GPU-decorated functions for CAPTIONS
282
- @spaces.GPU(duration=50)
283
- @torch.no_grad()
284
- def generate_engaging_only(image, custom_instruction=""):
285
- result = safe_generate_caption_direct(image, "engaging", max_chars=1000, custom_instruction=custom_instruction) if image else "❌ Upload image first"
286
- gc.collect()
287
- return result
 
 
 
 
288
 
289
- @spaces.GPU(duration=50)
290
- @torch.no_grad()
291
- def generate_casual_friend_only(image, keywords_text="", custom_instruction=""):
292
- result = safe_generate_caption_direct(image, "casual_friend", max_chars=1000, keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
293
- gc.collect()
294
- return result
295
 
296
- @spaces.GPU(duration=50)
297
- @torch.no_grad()
298
- def generate_uncensored_keywords_only(image, keywords_text="", custom_instruction=""):
299
- result = safe_generate_caption_direct(image, "uncensored_keywords", max_chars=1000, keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
300
- gc.collect()
301
- return result
302
 
303
- # Playground function
304
- @spaces.GPU(duration=50)
305
- @torch.no_grad()
306
- def generate_playground(image, system_prompt, user_prompt):
307
- result = safe_generate_custom_prompt(image, system_prompt, user_prompt, max_chars=1000) if image else "❌ Upload image first"
308
- gc.collect()
309
- return result
 
 
 
 
310
 
311
- # Separate Q&A function - keep this accurate and focused
312
- @spaces.GPU(duration=40)
313
- @torch.no_grad()
314
- def answer_question(image, question):
315
- """Answer questions about the image - focused and accurate"""
 
 
 
 
 
316
  if not image:
317
  return "❌ Upload image first"
318
- if not question or not question.strip():
319
- return "❌ Please ask a question"
320
-
321
- # Short, direct Q&A prompt
322
- qa_prompt = f"Answer this question about the image: {question.strip()}"
323
 
324
- # Simple system message
325
- convo = [
326
- {"role": "system", "content": "You are a helpful image analyst."},
327
- {"role": "user", "content": qa_prompt}
328
- ]
329
 
330
- convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
331
- inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
332
-
333
- device = next(model.parameters()).device
334
- inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
 
 
335
 
336
- if 'pixel_values' in inputs:
337
- inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
 
338
 
339
- with torch.no_grad():
340
- output = model.generate(
341
- **inputs,
342
- max_new_tokens=200,
343
- do_sample=True,
344
- temperature=0.3, # Lower temperature for more accurate Q&A
345
- top_p=0.8,
346
- top_k=None,
347
- repetition_penalty=1.1,
348
- use_cache=True,
349
- pad_token_id=processor.tokenizer.eos_token_id,
350
- eos_token_id=processor.tokenizer.eos_token_id
351
- )
352
 
353
- # Decode result
354
- if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
355
- input_length = inputs['input_ids'].shape[1]
356
- if len(output[0]) > input_length:
357
- generate_ids = output[0][input_length:]
358
- result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
359
- else:
360
- result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
361
- else:
362
- result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
363
 
364
- result = result.strip()
 
 
 
 
 
 
 
365
 
366
- # Cleanup
367
- del inputs, output
368
- gc.collect()
369
 
370
- final_result = postprocess_caption(result, max_chars=300)
371
  return final_result if final_result else "❌ No answer generated"
372
 
373
- # ===== Export =====
374
- def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path=""):
375
  try:
376
  data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source":"JoyCaption","data":{}}
377
  if keywords.strip(): data["data"]["keywords"]=keywords.strip()
@@ -382,8 +240,8 @@ def export_joycaption_data(keywords, custom_instructions, question, engaging_cap
382
  image_url=fix_image_url(image_path, host=(SPACE_HOST or ""))
383
  if image_url: data["data"]["image_url"]=image_url
384
  if engaging_caption.strip(): data["data"]["caption_engaging"]=engaging_caption.strip()
385
- if casual_caption.strip(): data["data"]["caption_casual_friend"]=casual_caption.strip()
386
- if keywords_caption.strip(): data["data"]["caption_keywords"]=keywords_caption.strip()
387
  if qa_answer.strip(): data["data"]["qa_answer"]=qa_answer.strip()
388
  if not data["data"]:
389
  return "❌ No data to export", None
@@ -399,26 +257,31 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
399
  with gr.Row():
400
  with gr.Column(scale=1):
401
  image_input = gr.Image(type="pil", label="πŸ“Έ Upload Image", height=400)
402
- keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2)
403
- custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2)
404
- question_input = gr.Textbox(label="❓ Ask Question", lines=2)
405
  ask_btn = gr.Button("❓ Ask", variant="secondary")
406
- qa_output = gr.Textbox(label="Q&A", lines=4, show_copy_button=True)
 
407
  with gr.Column(scale=1):
408
- g1 = gr.Button("πŸ“ Casual Descriptive", variant="primary")
409
- out1 = gr.Textbox(lines=7, show_copy_button=True)
410
- g2 = gr.Button("πŸ”₯ Erotic", variant="secondary")
411
- out2 = gr.Textbox(lines=7, show_copy_button=True)
412
- g3 = gr.Button("🎯 Custom Instruction", variant="secondary")
413
- out3 = gr.Textbox(lines=7, show_copy_button=True)
414
- export_btn = gr.Button("πŸ“₯ Export All Data")
 
 
 
415
  export_out = gr.Textbox(visible=False)
416
  export_file = gr.File(visible=False)
417
 
418
- g1.click(generate_engaging_only, [image_input, custom_instruction_input], out1)
419
- g2.click(generate_casual_friend_only, [image_input, keywords_input, custom_instruction_input], out2)
420
- g3.click(generate_uncensored_keywords_only, [image_input, keywords_input, custom_instruction_input], out3)
421
- ask_btn.click(answer_question, [image_input, question_input], qa_output)
 
422
 
423
  def handle_export(k, c, q, e1, e2, e3, qa, img):
424
  msg, fd = export_joycaption_data(k,c,q,e1,e2,e3,qa,img)
@@ -430,35 +293,27 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
430
  return gr.update(value=msg,visible=True), gr.update(visible=False)
431
 
432
  export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
433
-
434
- # ===== PLAYGROUND SECTION =====
435
- gr.HTML("<hr><h2>πŸ§ͺ Playground - Custom Prompts</h2><p>Test custom system and user prompts (not included in JSON export)</p>")
436
 
437
- with gr.Row():
438
- with gr.Column(scale=1):
439
- playground_system = gr.Textbox(
440
- label="πŸ”§ System Prompt",
441
- lines=2,
442
- value="You are a helpful image captioner.",
443
- placeholder="Enter custom system prompt..."
444
- )
445
- playground_prompt = gr.Textbox(
446
- label="πŸ’¬ User Prompt",
447
- lines=3,
448
- value="Describe this image in detail. Include the people, their appearance, clothing, poses, expressions, and the setting.",
449
- placeholder="Enter custom user prompt..."
450
- )
451
- playground_btn = gr.Button("πŸ§ͺ Generate", variant="secondary")
452
-
453
- with gr.Column(scale=1):
454
- playground_output = gr.Textbox(
455
- label="🎯 Playground Output",
456
- lines=7,
457
- show_copy_button=True,
458
- placeholder="Custom prompt results will appear here..."
459
- )
460
-
461
- playground_btn.click(generate_playground, [image_input, playground_system, playground_prompt], playground_output)
462
 
463
  if __name__ == "__main__":
464
  demo.launch()
 
13
  return _wrap
14
  spaces.GPU = _spaces_gpu
15
 
 
 
 
 
16
  import gradio as gr
17
  import torch
18
+ from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
19
  from PIL import Image
20
+ import tempfile, gc, os, shutil, json, time
21
  from pathlib import Path
22
+ from threading import Thread
23
+ from typing import Generator
24
  from hf_space_utils import fix_image_url, postprocess_caption
25
 
26
  # ===== Storage cleanup setup =====
 
46
 
47
  TITLE = """
48
  <div style="text-align:center;margin:20px 0;">
49
+ <h1>🎨 JoyCaption Three-Tone + Q&A (v4.0 ZeroGPU)</h1>
50
+ <p><strong>✅ Restored working prompts • Natural model behavior • Proven parameters</strong></p>
51
  </div><hr>
52
  """
53
 
54
  print("πŸš€ Loading JoyCaption model...")
55
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
56
+ model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
57
+ model.eval()
58
+ cleanup_storage()
59
+ print("βœ… Model loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ # ===== WORKING CAPTION PROMPTS (from beta-one space) =====
62
  CAPTION_PROMPTS = {
63
+ "engaging": "Write a descriptive caption for this image in a casual tone.",
64
+ "erotic": "Write a detailed, explicit description of this scene. Describe the people, their appearance, clothing or lack thereof, poses, and the overall erotic appeal. Use direct adult language and be thorough.",
65
+ "custom": "Write a detailed description for this image."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  }
67
 
68
+ @spaces.GPU()
69
+ @torch.no_grad()
70
+ def generate_caption(image: Image.Image, tone: str, keywords_text: str = "", custom_instruction: str = "") -> Generator[str, None, None]:
71
+ """Generate caption using proven working method from beta-one space"""
72
+ torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
73
 
74
+ if image is None:
75
+ yield "❌ No image provided"
76
+ return
77
+
78
+ # Build prompt based on tone (like the working spaces)
79
+ if tone == "engaging":
80
+ prompt = CAPTION_PROMPTS["engaging"]
81
+ elif tone == "erotic":
82
+ prompt = CAPTION_PROMPTS["erotic"]
83
+ if keywords_text and keywords_text.strip():
84
+ prompt += f" Pay special attention to these elements if present: {keywords_text.strip()}"
85
+ elif tone == "custom":
86
+ prompt = CAPTION_PROMPTS["custom"]
87
+ if custom_instruction and custom_instruction.strip():
88
+ prompt += f" Make sure that you mention: {custom_instruction.strip()}"
89
+ if keywords_text and keywords_text.strip():
90
+ prompt += f" You are allowed to use these keywords: {keywords_text.strip()}"
91
 
92
+ # Use the working system prompt from beta-one space
93
+ convo = [
94
+ {
95
+ "role": "system",
96
+ "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
97
+ },
98
+ {
99
+ "role": "user",
100
+ "content": prompt.strip(),
101
+ },
102
+ ]
103
 
104
+ # Use the exact same processing as beta-one space
105
+ convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
106
+ inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to('cuda')
107
+ inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
110
+
111
+ # Use the exact same parameters as beta-one space (NO repetition penalty!)
112
+ generate_kwargs = dict(
113
+ **inputs,
114
+ max_new_tokens=600,
115
+ do_sample=True,
116
+ temperature=0.6, # Proven working value
117
+ top_k=None,
118
+ top_p=0.9, # Proven working value
119
+ use_cache=True,
120
+ streamer=streamer,
121
+ )
122
+
123
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
124
+ t.start()
125
+
126
+ outputs = []
127
+ for text in streamer:
128
+ outputs.append(text)
129
+ yield "".join(outputs)
130
+
131
+ @spaces.GPU()
132
+ @torch.no_grad()
133
+ def answer_question(image: Image.Image, question: str) -> Generator[str, None, None]:
134
+ """Q&A using proven working method from VQA space"""
135
+ torch.cuda.empty_cache()
136
+
137
+ if image is None:
138
+ yield "❌ No image provided"
139
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ if not question or not question.strip():
142
+ yield "❌ Please ask a question"
143
+ return
144
 
145
+ # Use the exact same approach as the working VQA space
146
+ convo = [
147
+ {
148
+ "role": "system",
149
+ "content": "You are a helpful image captioner.", # From VQA space
150
+ },
151
+ {
152
+ "role": "user",
153
+ "content": question.strip(), # Direct user input like VQA space
154
+ },
155
+ ]
156
 
157
+ convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
158
+ inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to('cuda')
159
+ inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
 
 
 
160
 
161
+ streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
 
 
 
 
162
 
163
+ # Use VQA space parameters (NO repetition penalty!)
164
+ generate_kwargs = dict(
165
+ **inputs,
166
+ max_new_tokens=300,
167
+ do_sample=True,
168
+ temperature=0.6, # From VQA space
169
+ top_k=None,
170
+ top_p=0.9, # From VQA space
171
+ use_cache=True,
172
+ streamer=streamer,
173
+ )
174
 
175
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
176
+ t.start()
177
+
178
+ outputs = []
179
+ for text in streamer:
180
+ outputs.append(text)
181
+ yield "".join(outputs)
182
+
183
+ # Wrapper functions for gradio (non-streaming for simplicity)
184
+ def generate_engaging_caption(image, custom_instruction=""):
185
  if not image:
186
  return "❌ Upload image first"
 
 
 
 
 
187
 
188
+ result = ""
189
+ for chunk in generate_caption(image, "engaging", custom_instruction=custom_instruction):
190
+ result = chunk
 
 
191
 
192
+ # Apply the same postprocessing
193
+ final_result = postprocess_caption(result, max_chars=1000)
194
+ return final_result if final_result else "❌ No result generated"
195
+
196
+ def generate_erotic_caption(image, keywords_text="", custom_instruction=""):
197
+ if not image:
198
+ return "❌ Upload image first"
199
 
200
+ result = ""
201
+ for chunk in generate_caption(image, "erotic", keywords_text=keywords_text, custom_instruction=custom_instruction):
202
+ result = chunk
203
 
204
+ final_result = postprocess_caption(result, max_chars=1000)
205
+ return final_result if final_result else "❌ No result generated"
206
+
207
+ def generate_custom_caption(image, keywords_text="", custom_instruction=""):
208
+ if not image:
209
+ return "❌ Upload image first"
 
 
 
 
 
 
 
210
 
211
+ result = ""
212
+ for chunk in generate_caption(image, "custom", keywords_text=keywords_text, custom_instruction=custom_instruction):
213
+ result = chunk
 
 
 
 
 
 
 
214
 
215
+ final_result = postprocess_caption(result, max_chars=1000)
216
+ return final_result if final_result else "❌ No result generated"
217
+
218
+ def ask_question(image, question):
219
+ if not image:
220
+ return "❌ Upload image first"
221
+ if not question or not question.strip():
222
+ return "❌ Please ask a question"
223
 
224
+ result = ""
225
+ for chunk in answer_question(image, question):
226
+ result = chunk
227
 
228
+ final_result = postprocess_caption(result, max_chars=400)
229
  return final_result if final_result else "❌ No answer generated"
230
 
231
+ # ===== Export function =====
232
+ def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, erotic_caption, custom_caption, qa_answer, image_path=""):
233
  try:
234
  data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source":"JoyCaption","data":{}}
235
  if keywords.strip(): data["data"]["keywords"]=keywords.strip()
 
240
  image_url=fix_image_url(image_path, host=(SPACE_HOST or ""))
241
  if image_url: data["data"]["image_url"]=image_url
242
  if engaging_caption.strip(): data["data"]["caption_engaging"]=engaging_caption.strip()
243
+ if erotic_caption.strip(): data["data"]["caption_erotic"]=erotic_caption.strip()
244
+ if custom_caption.strip(): data["data"]["caption_custom"]=custom_caption.strip()
245
  if qa_answer.strip(): data["data"]["qa_answer"]=qa_answer.strip()
246
  if not data["data"]:
247
  return "❌ No data to export", None
 
257
  with gr.Row():
258
  with gr.Column(scale=1):
259
  image_input = gr.Image(type="pil", label="πŸ“Έ Upload Image", height=400)
260
+ keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2, placeholder="Optional: Keywords for erotic/custom captions")
261
+ custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2, placeholder="Optional: Custom instruction for third caption")
262
+ question_input = gr.Textbox(label="❓ Ask Question", lines=2, placeholder="Ask anything about the image")
263
  ask_btn = gr.Button("❓ Ask", variant="secondary")
264
+ qa_output = gr.Textbox(label="Q&A Answer", lines=4, show_copy_button=True)
265
+
266
  with gr.Column(scale=1):
267
+ g1 = gr.Button("πŸ“ Casual Descriptive", variant="primary", size="lg")
268
+ out1 = gr.Textbox(label="Casual Caption", lines=6, show_copy_button=True)
269
+
270
+ g2 = gr.Button("πŸ”₯ Erotic", variant="secondary", size="lg")
271
+ out2 = gr.Textbox(label="Erotic Caption", lines=6, show_copy_button=True)
272
+
273
+ g3 = gr.Button("🎯 Custom Instruction", variant="secondary", size="lg")
274
+ out3 = gr.Textbox(label="Custom Caption", lines=6, show_copy_button=True)
275
+
276
+ export_btn = gr.Button("πŸ“₯ Export All Data", variant="secondary")
277
  export_out = gr.Textbox(visible=False)
278
  export_file = gr.File(visible=False)
279
 
280
+ # Connect buttons
281
+ g1.click(generate_engaging_caption, [image_input, custom_instruction_input], out1)
282
+ g2.click(generate_erotic_caption, [image_input, keywords_input, custom_instruction_input], out2)
283
+ g3.click(generate_custom_caption, [image_input, keywords_input, custom_instruction_input], out3)
284
+ ask_btn.click(ask_question, [image_input, question_input], qa_output)
285
 
286
  def handle_export(k, c, q, e1, e2, e3, qa, img):
287
  msg, fd = export_joycaption_data(k,c,q,e1,e2,e3,qa,img)
 
293
  return gr.update(value=msg,visible=True), gr.update(visible=False)
294
 
295
  export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
 
 
 
296
 
297
+ gr.HTML("<hr><h2>πŸ“‹ Usage Instructions</h2>")
298
+ gr.Markdown("""
299
+ ### **How to Use:**
300
+ 1. **📸 Upload an image** in the left panel
301
+ 2. **🎯 Optional**: Add keywords or custom instructions
302
+ 3. **Click caption buttons** to generate different styles
303
+ 4. **❓ Ask questions** about the image using natural language
304
+ 5. **📥 Export** all results as JSON
305
+
306
+ ### **Caption Types:**
307
+ - **📝 Casual Descriptive**: Natural, conversational descriptions
308
+ - **🔥 Erotic**: Explicit adult content descriptions (uses keywords)
309
+ - **🎯 Custom Instruction**: Follows your specific instructions (uses both keywords and custom instruction)
310
+
311
+ ### **✨ Key Improvements:**
312
+ - Uses **proven working prompts** from original JoyCaption spaces
313
+ - **Natural model behavior** without over-engineering
314
+ - **No repetition penalties** that caused glitches
315
+ - **Same parameters** as working reference spaces (temp 0.6, top-p 0.9)
316
+ """)
 
 
 
 
 
317
 
318
  if __name__ == "__main__":
319
  demo.launch()