nickdigger committed on
Commit
5eccbc7
·
verified ·
1 Parent(s): 58db3b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +314 -118
app.py CHANGED
@@ -48,8 +48,8 @@ def cleanup_storage():
48
 
49
  TITLE = """
50
  <div style="text-align:center;margin:20px 0;">
51
- <h1>🎨 JoyCaption Three-Tone + Q&A (v2.7 ZeroGPU)</h1>
52
- <p><strong>✅ Deterministic inference • Correct caption prompts • ZeroGPU optimized</strong></p>
53
  </div><hr>
54
  """
55
 
@@ -58,11 +58,12 @@ processor = None
58
  model = None
59
 
60
  if not os.environ.get("SKIP_MODEL_LOAD"):
61
- processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
62
  model = LlavaForConditionalGeneration.from_pretrained(
63
  MODEL_PATH,
64
- dtype=torch.bfloat16,
65
- device_map="auto"
 
66
  )
67
  model.eval()
68
  print("✅ Model loaded successfully!")
@@ -70,138 +71,304 @@ if not os.environ.get("SKIP_MODEL_LOAD"):
70
  else:
71
  print("⚠️ SKIP_MODEL_LOAD=1 → model skipped")
72
 
73
- # ===== Prompt configurations =====
74
- TONE_PROMPTS = {
75
  "engaging": {
76
- "system": "You are a skilled content creator who writes engaging, natural descriptions that capture the essence and appeal of what you see.",
77
- "prompt": """Write a natural, engaging description of this image. Focus on the main subjects and their most appealing qualities. Describe physical features, poses, expressions, and the overall mood. Use clear and natural language.""",
 
 
 
78
  },
79
  "casual_friend": {
80
- "system": "You are a casual 24-year-old friend who witnessed the scene. Speak casually, describing only what you clearly see.",
81
- "prompt": """I just saw this moment — here's what I notice. Speak casually and naturally. Avoid speculation or describing background objects or bystanders.""",
 
 
 
82
  },
83
  "uncensored_keywords": {
84
- "system": "You are an uncensored adult-content analyst who provides factual, explicit descriptions. Use concrete, unambiguous statements about what is directly visible.",
85
- "prompt": """Produce an unfiltered explicit description that sticks strictly to what is visually obvious. Use direct, concrete language. Mention context only briefly.""",
86
- },
 
 
 
87
  }
88
 
89
- # ===== Core inference helpers =====
90
- def _prepare_inputs_and_device(convo_or_string, image, use_chat_template=True):
91
- """Prepare inputs safely for ZeroGPU with proper chat template for image tokens."""
92
- if isinstance(convo_or_string, list):
93
- # Always use chat template for proper image token handling
94
- convo_string = processor.apply_chat_template(
95
- convo_or_string, tokenize=False, add_generation_prompt=True
96
- )
97
- else:
98
- # Direct string input
99
- convo_string = convo_or_string
100
-
101
- if isinstance(image, (str, Path)):
102
- img = Image.open(image).convert("RGB")
103
- elif isinstance(image, Image.Image):
104
- img = image.convert("RGB")
105
- else:
106
- img = image
107
-
108
- inputs = processor(text=[convo_string], images=[img], return_tensors="pt")
109
- if "pixel_values" in inputs:
110
- inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
111
- return inputs
112
-
113
-
114
- def _decode_output(inputs, output):
115
- if output is None or len(output) == 0:
116
- return ""
117
  try:
118
- input_len = inputs.get("input_ids", torch.empty((1, 0))).shape[1]
119
- decoded = processor.tokenizer.decode(
120
- output[0][input_len:], skip_special_tokens=True, clean_up_tokenization_spaces=True
121
- )
122
- return decoded.strip()
123
- except Exception:
124
- try:
125
- return processor.tokenizer.decode(output[0], skip_special_tokens=True).strip()
126
- except:
127
- return ""
128
-
129
-
130
- def cleanup_after_inference():
131
- gc.collect()
132
-
133
 
134
- def run_image_chat_generation(convo, image, max_new_tokens=150, caption_mode=False):
135
- """Deterministic generation with correct mode separation."""
136
- if processor is None or model is None:
137
- return None, "❌ Model not initialized"
138
  try:
139
- inputs = _prepare_inputs_and_device(convo, image, use_chat_template=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  with torch.no_grad():
141
  output = model.generate(
142
  **inputs,
143
- max_new_tokens=max_new_tokens,
144
- do_sample=False,
145
- temperature=None,
146
- top_p=None,
147
- repetition_penalty=1.0,
148
- pad_token_id=processor.tokenizer.eos_token_id,
149
- eos_token_id=processor.tokenizer.eos_token_id,
150
  use_cache=True,
 
 
151
  )
152
- decoded = _decode_output(inputs, output)
153
- cleanup_after_inference()
154
- return decoded, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  except Exception as e:
156
- cleanup_after_inference()
157
- return None, f"❌ Generation error: {str(e)[:200]}"
158
 
159
- # ===== Caption generators =====
160
- def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", custom_instruction=""):
161
- if image is None:
162
- return f"❌ No image provided for {tone}"
163
- tone_cfg = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
164
-
165
- # Build proper conversation format for captioning (JoyCaption expects this format)
166
- base_prompt = tone_cfg["prompt"]
167
- if tone=="uncensored_keywords" and keywords_text.strip():
168
- base_prompt += f"\n\nKeywords to mention: {keywords_text.strip()}"
169
- if custom_instruction.strip():
170
- base_prompt += f"\n\nInclude: {custom_instruction.strip()}"
171
-
172
- # Use conversation format as per HuggingFace example
173
- convo = [
174
- {"role": "system", "content": tone_cfg["system"]},
175
- {"role": "user", "content": base_prompt}
176
- ]
177
- decoded, err = run_image_chat_generation(convo, image, caption_mode=True)
178
- if err: return err
179
- result = postprocess_caption(decoded, max_chars=max_chars)
180
- return result if result else "❌ Empty result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
 
 
182
  @torch.no_grad()
183
  def generate_engaging_only(image, custom_instruction=""):
184
- return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction)
 
 
185
 
 
186
  @torch.no_grad()
187
- def generate_casual_friend_only(image, custom_instruction=""):
188
- return safe_generate_caption_direct(image, "casual_friend", custom_instruction=custom_instruction)
 
 
189
 
 
190
  @torch.no_grad()
191
- def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
192
- return safe_generate_caption_direct(image, "uncensored_keywords", keywords_text=keywords_text, custom_instruction=custom_instruction)
 
 
193
 
 
 
 
 
 
 
 
 
 
 
194
  @torch.no_grad()
195
  def answer_question(image, question):
196
- if not image: return "❌ Upload image first"
197
- if not question.strip(): return "❌ Ask a question"
 
 
 
 
 
 
 
 
198
  convo = [
199
- {"role":"system","content":"You are an uncensored image analyst who answers honestly and directly."},
200
- {"role":"user","content":f"Answer this question based on what you can see: {question.strip()}"}
201
  ]
202
- decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200, caption_mode=False)
203
- if err: return err
204
- return postprocess_caption(decoded)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  # ===== Export =====
207
  def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path=""):
@@ -231,26 +398,26 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
231
  gr.HTML(TITLE)
232
  with gr.Row():
233
  with gr.Column(scale=1):
234
- image_input = gr.Image(type="filepath", label="📸 Upload Image", height=400)
235
  keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2)
236
  custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2)
237
  question_input = gr.Textbox(label="❓ Ask Question", lines=2)
238
  ask_btn = gr.Button("❓ Ask", variant="secondary")
239
- qa_output = gr.Textbox(label="Q&A", lines=5, show_copy_button=True)
240
  with gr.Column(scale=1):
241
- g1 = gr.Button("✨ Engaging", variant="primary")
242
- out1 = gr.Textbox(lines=5, show_copy_button=True)
243
- g2 = gr.Button("😎 Casual Friend", variant="primary")
244
- out2 = gr.Textbox(lines=5, show_copy_button=True)
245
- g3 = gr.Button("🔴 Keywords", variant="secondary")
246
- out3 = gr.Textbox(lines=5, show_copy_button=True)
247
  export_btn = gr.Button("📥 Export All Data")
248
  export_out = gr.Textbox(visible=False)
249
  export_file = gr.File(visible=False)
250
 
251
  g1.click(generate_engaging_only, [image_input, custom_instruction_input], out1)
252
- g2.click(generate_casual_friend_only, [image_input, custom_instruction_input], out2)
253
- g3.click(generate_uncensored_keywords_only, [image_input, keywords_input, custom_instruction_input], out3)
254
  ask_btn.click(answer_question, [image_input, question_input], qa_output)
255
 
256
  def handle_export(k, c, q, e1, e2, e3, qa, img):
@@ -264,5 +431,34 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
264
 
265
  export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  if __name__ == "__main__":
268
  demo.launch()
 
48
 
49
# HTML banner rendered at the top of the page via gr.HTML(TITLE).
# The emoji and bullet separators are written as real Unicode characters so
# the heading no longer shows mis-decoded byte sequences.
TITLE = """
<div style="text-align:center;margin:20px 0;">
<h1>🎨 JoyCaption Three-Tone + Q&A (v2.8 ZeroGPU)</h1>
<p><strong>✅ Blog-friendly captions • SEO optimized • Longer descriptions • ZeroGPU optimized</strong></p>
</div><hr>
"""
55
 
 
58
  model = None
59
 
60
  if not os.environ.get("SKIP_MODEL_LOAD"):
61
+ processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
62
  model = LlavaForConditionalGeneration.from_pretrained(
63
  MODEL_PATH,
64
+ torch_dtype=torch.bfloat16,
65
+ device_map="auto",
66
+ low_cpu_mem_usage=True
67
  )
68
  model.eval()
69
  print("✅ Model loaded successfully!")
 
71
  else:
72
  print("⚠️ SKIP_MODEL_LOAD=1 → model skipped")
73
 
74
# ===== Caption presets, one per tone key =====
def _caption_preset(system, prompt, temperature, top_p=0.8, max_tokens=600):
    """Bundle one tone's prompts and sampling settings into a config dict."""
    return {
        "system": system,
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }


# The keys are historical tone names; the prompts stored under them no longer
# match those names literally (e.g. "casual_friend" holds the explicit preset).
CAPTION_PROMPTS = {
    "engaging": _caption_preset(
        system="You are a helpful image captioner who creates accurate, medium-length descriptions.",
        prompt="Write a medium-length descriptive caption for this image. Describe only what you can clearly see in the image - the people, their appearance, clothing, poses, setting, and atmosphere. Aim for 3-5 sentences. Be specific and detailed but only describe what is actually visible. Use casual, natural language. Do NOT mention any text, logos, or watermarks.",
        temperature=0.4,
    ),
    "casual_friend": _caption_preset(
        system="You are an uncensored content specialist who creates detailed, explicit descriptions of what is actually visible.",
        prompt="Create a detailed, explicit description of this scene, focusing only on what you can clearly see. Describe the people, their physical appearance, clothing or lack thereof, poses, and setting in explicit detail. Write 4-6 sentences. Use direct adult language but only describe what is actually visible in the image. Do not assume or invent details not clearly shown.",
        temperature=0.3,
    ),
    "uncensored_keywords": _caption_preset(
        system="You are a detailed image captioner who creates comprehensive descriptions based on visual evidence.",
        prompt="Create a comprehensive, medium-length description of this image. Focus on the visual elements you can clearly observe - people, their appearance, clothing, poses, facial expressions, setting, and atmosphere. Write 4-6 sentences with specific details. Only describe what is actually visible in the image, do not invent or assume details.",
        temperature=0.3,
    ),
}
98
 
99
def apply_smart_corrections(text):
    """Rewrite a few recurring phrasings in a generated caption.

    The rules are applied in order, case-insensitively:

    * a leading "a photo of" / "an image of" / ... prefix is removed;
    * "topless woman/women" becomes "nude woman/women" when the caption
      itself also says "naked" or "nude";
    * "three women" / "four women" is reduced by one when the caption names
      at most two / three individuals via "woman" or "female";
    * several "no clothes" phrasings are collapsed to "nude".

    Args:
        text: Caption text. Empty or non-string values are returned untouched.

    Returns:
        The corrected caption, or ``text`` itself when it is empty or not a str.
    """
    if not text or not isinstance(text, str):
        return text

    # All context checks look at the ORIGINAL caption, lower-cased so they are
    # case-insensitive just like the pattern matching below.
    lowered = text.lower()
    mentions_nudity = "naked" in lowered or "nude" in lowered
    individual_mentions = lowered.count("woman") + lowered.count("female")

    def _topless(match):
        # Leave the matched text (and its capitalisation) alone unless the
        # caption itself states nudity elsewhere.
        if not mentions_nudity:
            return match.group(0)
        return "nude " + match.group(1).lower()

    def _three_women(match):
        # NOTE(review): this heuristic also fires when no individual is
        # described at all (count 0) -- confirm that is intended.
        return "two women" if individual_mentions <= 2 else match.group(0)

    def _four_women(match):
        return "three women" if individual_mentions <= 3 else match.group(0)

    rules = (
        (r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*', ''),
        (r'\btopless (women|woman)\b', _topless),
        (r'\bthree women\b', _three_women),
        (r'\bfour women\b', _four_women),
        (r'\bwearing nothing\b', 'nude'),
        # Allow at most three words between "not wearing" and "clothes"; an
        # unbounded ".*" could swallow everything up to a later, unrelated
        # "clothes" in another sentence.
        (r'\bnot wearing(?:\s+\w+){0,3}\s+clothes\b', 'nude'),
        (r'\bcompletely naked\b', 'nude'),
        (r'\bfully nude\b', 'nude'),
    )

    corrected = text
    for pattern, replacement in rules:
        corrected = re.sub(pattern, replacement, corrected, flags=re.IGNORECASE)
    return corrected
 
 
 
 
 
128
 
129
def _sample_reply(system_prompt, user_prompt, image, *, max_new_tokens, temperature, top_p):
    """Run one sampled chat turn about ``image`` and return the decoded reply.

    Uses the module-level ``processor`` and ``model``; callers are expected to
    have checked that both are loaded.

    Returns:
        The stripped reply text (possibly empty), or None when ``generate``
        returned nothing at all.
    """
    convo = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)

    # NOTE(review): normalise to RGB so RGBA / palette uploads do not reach the
    # image processor with an unexpected channel count -- confirm the
    # processor does not already do this.
    if hasattr(image, "convert"):
        image = image.convert("RGB")
    inputs = processor(text=[convo_string], images=[image], return_tensors="pt")

    device = next(model.parameters()).device
    inputs = {
        key: (value.to(device, non_blocking=True) if hasattr(value, "to") else value)
        for key, value in inputs.items()
    }
    if "pixel_values" in inputs:
        # The model is loaded in bfloat16, so the pixel tensor is cast to match.
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

    eos_id = processor.tokenizer.eos_token_id
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=None,
            use_cache=True,
            pad_token_id=eos_id,
            eos_token_id=eos_id,
        )

    if output is None or len(output) == 0:
        return None

    # The output sequence is treated as prompt tokens followed by new tokens.
    # Always drop the prompt part: when no new token was produced the reply is
    # empty, rather than the prompt text being echoed back as a "caption".
    input_ids = inputs.get("input_ids")
    prompt_len = input_ids.shape[1] if input_ids is not None and len(input_ids.shape) >= 2 else 0
    reply = processor.tokenizer.decode(
        output[0][prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return reply.strip()


def safe_generate_caption_direct(image, tone, max_chars=1000, keywords_text="", custom_instruction=""):
    """Generate a caption for ``image`` using the preset stored under ``tone``.

    Never raises: every failure is reported as a "❌ ..." status string so the
    UI can show it in the output box.

    Args:
        image: Image to caption; None yields a "no image" status.
        tone: Key into CAPTION_PROMPTS; unknown keys fall back to "engaging".
        max_chars: Length limit forwarded to postprocess_caption.
        keywords_text: Extra elements to look for; only used by "casual_friend".
        custom_instruction: Extra guidance appended to the prompt. The wording
            differs for "uncensored_keywords" versus the other tones.

    Returns:
        The post-processed caption, or a status string starting with "❌".
    """
    try:
        if image is None:
            return f"❌ No image provided for {tone}"
        if processor is None or model is None:
            # Model loading is skipped when SKIP_MODEL_LOAD is set.
            return "❌ Model not initialized"

        caption_config = CAPTION_PROMPTS.get(tone, CAPTION_PROMPTS["engaging"])
        base_prompt = caption_config["prompt"]

        keywords = (keywords_text or "").strip()
        extra = (custom_instruction or "").strip()
        if tone == "casual_friend" and keywords:
            base_prompt += f" Pay special attention to these elements if present: {keywords}"
        if extra:
            if tone == "uncensored_keywords":
                base_prompt += f" Make sure that you mention: {extra}"
            else:
                base_prompt += f" Also focus on: {extra}"

        reply = _sample_reply(
            caption_config["system"],
            base_prompt,
            image,
            max_new_tokens=caption_config.get("max_tokens", 600),
            temperature=caption_config.get("temperature", 0.4),
            top_p=caption_config.get("top_p", 0.8),
        )
        if reply is None:
            return f"❌ No output generated for {tone}"

        final_result = postprocess_caption(apply_smart_corrections(reply), max_chars=max_chars)
        return final_result if final_result else f"❌ Empty result for {tone}"
    except Exception as e:
        # UI boundary: surface a truncated message instead of crashing the app.
        return f"❌ Error: {str(e)[:200]}"
    finally:
        gc.collect()


def safe_generate_custom_prompt(image, system_prompt, user_prompt, max_chars=1000):
    """Generate a caption from caller-supplied system and user prompts.

    Never raises: every failure is reported as a "❌ ..." status string.

    Args:
        image: Image to caption; None yields a "no image" status.
        system_prompt: Required system message (blank is rejected).
        user_prompt: Required user message (blank is rejected).
        max_chars: Length limit forwarded to postprocess_caption.

    Returns:
        The post-processed caption, or a status string starting with "❌".
    """
    try:
        if image is None:
            return "❌ No image provided"
        if not system_prompt or not system_prompt.strip():
            return "❌ System prompt is required"
        if not user_prompt or not user_prompt.strip():
            return "❌ User prompt is required"
        if processor is None or model is None:
            # Model loading is skipped when SKIP_MODEL_LOAD is set.
            return "❌ Model not initialized"

        reply = _sample_reply(
            system_prompt.strip(),
            user_prompt.strip(),
            image,
            max_new_tokens=600,
            temperature=0.5,
            top_p=0.8,
        )
        if reply is None:
            return "❌ No output generated"

        final_result = postprocess_caption(apply_smart_corrections(reply), max_chars=max_chars)
        return final_result if final_result else "❌ Empty result"
    except Exception as e:
        # UI boundary: surface a truncated message instead of crashing the app.
        return f"❌ Error: {str(e)[:200]}"
    finally:
        gc.collect()
281
 
282
+ # Individual GPU-decorated functions for CAPTIONS
283
@spaces.GPU(duration=50)
@torch.no_grad()
def generate_engaging_only(image, custom_instruction=""):
    """Caption ``image`` with the "engaging" preset (first caption button).

    Args:
        image: Uploaded image, or None when nothing has been uploaded.
        custom_instruction: Optional extra guidance forwarded to the prompt.

    Returns:
        The caption text, or a status string starting with "❌".
    """
    # Explicit None check: truthiness of an image object is not a reliable
    # "was something uploaded" test.
    if image is None:
        return "❌ Upload image first"
    try:
        return safe_generate_caption_direct(
            image, "engaging", max_chars=1000, custom_instruction=custom_instruction
        )
    finally:
        gc.collect()
289
 
290
@spaces.GPU(duration=50)
@torch.no_grad()
def generate_casual_friend_only(image, keywords_text="", custom_instruction=""):
    """Caption ``image`` with the "casual_friend" preset (second caption button).

    Args:
        image: Uploaded image, or None when nothing has been uploaded.
        keywords_text: Optional elements the caption should pay attention to.
        custom_instruction: Optional extra guidance forwarded to the prompt.

    Returns:
        The caption text, or a status string starting with "❌".
    """
    # Explicit None check: truthiness of an image object is not a reliable
    # "was something uploaded" test.
    if image is None:
        return "❌ Upload image first"
    try:
        return safe_generate_caption_direct(
            image,
            "casual_friend",
            max_chars=1000,
            keywords_text=keywords_text,
            custom_instruction=custom_instruction,
        )
    finally:
        gc.collect()
296
 
297
@spaces.GPU(duration=50)
@torch.no_grad()
def generate_uncensored_keywords_only(image, custom_instruction=""):
    """Caption ``image`` with the "uncensored_keywords" preset (third caption button).

    Args:
        image: Uploaded image, or None when nothing has been uploaded.
        custom_instruction: Optional text the caption is asked to mention.

    Returns:
        The caption text, or a status string starting with "❌".
    """
    # Explicit None check: truthiness of an image object is not a reliable
    # "was something uploaded" test.
    if image is None:
        return "❌ Upload image first"
    try:
        return safe_generate_caption_direct(
            image, "uncensored_keywords", max_chars=1000, custom_instruction=custom_instruction
        )
    finally:
        gc.collect()
303
 
304
+ # Playground function
305
@spaces.GPU(duration=50)
@torch.no_grad()
def generate_playground(image, system_prompt, user_prompt):
    """Caption ``image`` with the user-edited playground prompts.

    Args:
        image: Uploaded image, or None when nothing has been uploaded.
        system_prompt: System message typed into the playground.
        user_prompt: User message typed into the playground.

    Returns:
        The caption text, or a status string starting with "❌".
    """
    # Explicit None check: truthiness of an image object is not a reliable
    # "was something uploaded" test.
    if image is None:
        return "❌ Upload image first"
    try:
        return safe_generate_custom_prompt(image, system_prompt, user_prompt, max_chars=1000)
    finally:
        gc.collect()
311
+
312
+ # Separate Q&A function - keep this accurate and focused
313
@spaces.GPU(duration=40)
@torch.no_grad()
def answer_question(image, question):
    """Answer a free-form question about ``image``.

    Never raises: like the caption generators, every failure is reported as a
    "❌ ..." status string so the UI can show it in the answer box.

    Args:
        image: Uploaded image, or None when nothing has been uploaded.
        question: The user's question; blank input is rejected.

    Returns:
        The post-processed answer (limited to 300 characters by
        postprocess_caption), or a status string starting with "❌".
    """
    if image is None:
        return "❌ Upload image first"
    if not question or not question.strip():
        return "❌ Please ask a question"
    if processor is None or model is None:
        # Model loading is skipped when SKIP_MODEL_LOAD is set.
        return "❌ Model not initialized"

    convo = [
        {"role": "system", "content": "You are a helpful image analyst."},
        {"role": "user", "content": f"Answer this question about the image: {question.strip()}"},
    ]

    try:
        convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)

        # NOTE(review): normalise to RGB so RGBA / palette uploads do not reach
        # the image processor with an unexpected channel count -- confirm the
        # processor does not already do this.
        if hasattr(image, "convert"):
            image = image.convert("RGB")
        inputs = processor(text=[convo_string], images=[image], return_tensors="pt")

        device = next(model.parameters()).device
        inputs = {
            key: (value.to(device, non_blocking=True) if hasattr(value, "to") else value)
            for key, value in inputs.items()
        }
        if "pixel_values" in inputs:
            # The model is loaded in bfloat16, so the pixel tensor is cast to match.
            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

        eos_id = processor.tokenizer.eos_token_id
        # Gradients are already disabled by the @torch.no_grad() decorator.
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.3,  # lower temperature than the caption presets
            top_p=0.8,
            top_k=None,
            use_cache=True,
            pad_token_id=eos_id,
            eos_token_id=eos_id,
        )
        if output is None or len(output) == 0:
            return "❌ No answer generated"

        # The output sequence is treated as prompt tokens followed by new
        # tokens. Always drop the prompt part so an empty generation yields an
        # empty answer instead of echoing the prompt.
        input_ids = inputs.get("input_ids")
        prompt_len = input_ids.shape[1] if input_ids is not None and len(input_ids.shape) >= 2 else 0
        answer = processor.tokenizer.decode(
            output[0][prompt_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        ).strip()

        # Drop the tensors before post-processing so they can be reclaimed.
        del inputs, output

        final_result = postprocess_caption(answer, max_chars=300)
        return final_result if final_result else "❌ No answer generated"
    except Exception as e:
        # UI boundary: surface a truncated message instead of crashing the app.
        return f"❌ Error: {str(e)[:200]}"
    finally:
        gc.collect()
372
 
373
  # ===== Export =====
374
  def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path=""):
 
398
  gr.HTML(TITLE)
399
  with gr.Row():
400
  with gr.Column(scale=1):
401
+ image_input = gr.Image(type="pil", label="📸 Upload Image", height=400)
402
  keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2)
403
  custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2)
404
  question_input = gr.Textbox(label="❓ Ask Question", lines=2)
405
  ask_btn = gr.Button("❓ Ask", variant="secondary")
406
+ qa_output = gr.Textbox(label="Q&A", lines=4, show_copy_button=True)
407
  with gr.Column(scale=1):
408
+ g1 = gr.Button("📝 Casual Descriptive", variant="primary")
409
+ out1 = gr.Textbox(lines=7, show_copy_button=True)
410
+ g2 = gr.Button("🔥 Erotic", variant="secondary")
411
+ out2 = gr.Textbox(lines=7, show_copy_button=True)
412
+ g3 = gr.Button("🎯 Custom Instruction", variant="secondary")
413
+ out3 = gr.Textbox(lines=7, show_copy_button=True)
414
  export_btn = gr.Button("📥 Export All Data")
415
  export_out = gr.Textbox(visible=False)
416
  export_file = gr.File(visible=False)
417
 
418
  g1.click(generate_engaging_only, [image_input, custom_instruction_input], out1)
419
+ g2.click(generate_casual_friend_only, [image_input, keywords_input, custom_instruction_input], out2)
420
+ g3.click(generate_uncensored_keywords_only, [image_input, custom_instruction_input], out3)
421
  ask_btn.click(answer_question, [image_input, question_input], qa_output)
422
 
423
  def handle_export(k, c, q, e1, e2, e3, qa, img):
 
431
 
432
  export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
433
 
434
+ # ===== PLAYGROUND SECTION =====
435
+ gr.HTML("<hr><h2>🧪 Playground - Custom Prompts</h2><p>Test custom system and user prompts (not included in JSON export)</p>")
436
+
437
+ with gr.Row():
438
+ with gr.Column(scale=1):
439
+ playground_system = gr.Textbox(
440
+ label="🔧 System Prompt",
441
+ lines=2,
442
+ value="You are a helpful image captioner who creates accurate, detailed descriptions based only on what is clearly visible.",
443
+ placeholder="Enter custom system prompt..."
444
+ )
445
+ playground_prompt = gr.Textbox(
446
+ label="💬 User Prompt",
447
+ lines=3,
448
+ value="Describe what you can clearly see in this image. Focus on the people present, their hair colors, body types, facial expressions, clothing or lack thereof, poses, and actions. Write 4-6 sentences with specific details. Only describe what is actually visible - do not assume or invent details not clearly shown.",
449
+ placeholder="Enter custom user prompt..."
450
+ )
451
+ playground_btn = gr.Button("🧪 Generate", variant="secondary")
452
+
453
+ with gr.Column(scale=1):
454
+ playground_output = gr.Textbox(
455
+ label="🎯 Playground Output",
456
+ lines=7,
457
+ show_copy_button=True,
458
+ placeholder="Custom prompt results will appear here..."
459
+ )
460
+
461
+ playground_btn.click(generate_playground, [image_input, playground_system, playground_prompt], playground_output)
462
+
463
  if __name__ == "__main__":
464
  demo.launch()