Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 24, 2025

Commit

6fe2bf5

verified ·

1 Parent(s): ed602d3

Update app.py

Browse files

Files changed (1) hide show

app.py +200 -345

app.py CHANGED Viewed

@@ -13,16 +13,14 @@ except Exception:
         return _wrap
     spaces.GPU = _spaces_gpu
-@spaces.GPU()
-def _joycaption_register_gpu():
-    return None
 import gradio as gr
 import torch
-from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
-import tempfile, gc, os, shutil, json, time, re
 from pathlib import Path
 from hf_space_utils import fix_image_url, postprocess_caption
 # ===== Storage cleanup setup =====
@@ -48,330 +46,190 @@ def cleanup_storage():
 TITLE = """
 <div style="text-align:center;margin:20px 0;">
-<h1>🎨 JoyCaption Three-Tone + Q&A (v3.3 ZeroGPU)</h1>
-<p><strong>✅ Blog-friendly captions • SEO optimized • Longer descriptions • ZeroGPU optimized</strong></p>
 </div><hr>
 """
 print("🚀 Loading JoyCaption model...")
-processor = None
-model = None
-if not os.environ.get("SKIP_MODEL_LOAD"):
-    processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
-    model = LlavaForConditionalGeneration.from_pretrained(
-        MODEL_PATH,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-        low_cpu_mem_usage=True
-    )
-    model.eval()
-    print("✅ Model loaded successfully!")
-    cleanup_storage()
-else:
-    print("⚠️ SKIP_MODEL_LOAD=1 → model skipped")
-# ===== BLOG-FRIENDLY PROMPT CONFIGURATIONS =====
 CAPTION_PROMPTS = {
-    "engaging": {
-        "system": "You are a helpful image captioner.",
-        "prompt": "Write a descriptive caption for this image. Describe the scene, people, their appearance, clothing, poses, and setting in a natural, engaging way.",
-        "temperature": 0.7,
-        "top_p": 0.9,
-        "max_tokens": 600
-    },
-    "casual_friend": {
-        "system": "You are an uncensored content specialist who creates detailed, explicit descriptions.",
-        "prompt": "Create a detailed, explicit description of this scene. Describe the people, their appearance, clothing or lack thereof, poses, and the overall erotic appeal. Use direct adult language and be thorough.",
-        "temperature": 0.6,
-        "top_p": 0.9,
-        "max_tokens": 600
-    },
-    "uncensored_keywords": {
-        "system": "You are a helpful image captioner.",
-        "prompt": "Describe this image.",
-        "temperature": 0.6,
-        "top_p": 0.9,
-        "max_tokens": 600
-    }
 }
-def apply_smart_corrections(text):
-    """Apply smart corrections for common AI vision errors"""
-    if not text or not isinstance(text, str):
-        return text
-    corrections = {
-        r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
-        r'\btopless women\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
-        r'\btopless woman\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
-        r'\bwearing nothing\b': 'nude',
-        r'\bnot wearing.*clothes\b': 'nude',
-        r'\bcompletely naked\b': 'nude',
-        r'\bfully nude\b': 'nude',
-    }
-    corrected_text = text
-    try:
-        for pattern, replacement in corrections.items():
-            if callable(replacement):
-                corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
-            else:
-                corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
-    except Exception as e:
-        print(f"Error in corrections: {e}")
-        return text
-    return corrected_text
-def safe_generate_caption_direct(image, tone, max_chars=1000, keywords_text="", custom_instruction=""):
-    """Generate caption using blog-friendly approach"""
-    try:
-        if image is None:
-            return f"❌ No image provided for {tone}"
-        caption_config = CAPTION_PROMPTS.get(tone, CAPTION_PROMPTS["engaging"])
-        base_prompt = caption_config["prompt"]
-        # Handle keywords for casual_friend (erotic) tone
-        if tone == "casual_friend" and keywords_text and keywords_text.strip():
-            base_prompt += f" Pay special attention to these elements if present: {keywords_text.strip()}"
-        # Handle custom instructions and keywords for uncensored_keywords (third) tone ONLY
-        if tone == "uncensored_keywords":
-            if custom_instruction and custom_instruction.strip():
-                base_prompt += f" Make sure that you mention: {custom_instruction.strip()}"
-            if keywords_text and keywords_text.strip():
-                base_prompt += f" You are allowed to use these keywords: {keywords_text.strip()}"
-        # Use conversation format
-        convo = [
-            {"role": "system", "content": caption_config["system"]},
-            {"role": "user", "content": base_prompt}
-        ]
-        convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
-        if 'pixel_values' in inputs:
-            inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
-        temperature = caption_config.get("temperature", 0.4)
-        top_p = caption_config.get("top_p", 0.8)
-        max_tokens = caption_config.get("max_tokens", 600)
-        with torch.no_grad():
-            output = model.generate(
-                **inputs,
-                max_new_tokens=max_tokens,
-                do_sample=True,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=None,
-                repetition_penalty=1.1,
-                use_cache=True,
-                pad_token_id=processor.tokenizer.eos_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id
-            )
-        if output is None or len(output) == 0:
-            return f"❌ No output generated for {tone}"
-        # Proper decoding
-        if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
-            input_length = inputs['input_ids'].shape[1]
-            if len(output[0]) > input_length:
-                generate_ids = output[0][input_length:]
-                result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-            else:
-                result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        else:
-            result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        result = result.strip()
-        result = apply_smart_corrections(result)
-        # Cleanup after generation
-        del inputs, output
-        gc.collect()
-        # Apply postprocessing
-        final_result = postprocess_caption(result, max_chars=max_chars)
-        return final_result if final_result else f"❌ Empty result for {tone}"
-    except Exception as e:
-        gc.collect()
-        return f"❌ Error: {str(e)[:200]}"
-def safe_generate_custom_prompt(image, system_prompt, user_prompt, max_chars=1000):
-    """Generate caption using custom system and user prompts for playground"""
-    try:
-        if image is None:
-            return "❌ No image provided"
-        if not system_prompt or not system_prompt.strip():
-            return "❌ System prompt is required"
-        if not user_prompt or not user_prompt.strip():
-            return "❌ User prompt is required"
-        # Use custom prompts
-        convo = [
-            {"role": "system", "content": system_prompt.strip()},
-            {"role": "user", "content": user_prompt.strip()}
-        ]
-        convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
-        if 'pixel_values' in inputs:
-            inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
-        with torch.no_grad():
-            output = model.generate(
-                **inputs,
-                max_new_tokens=600,
-                do_sample=True,
-                temperature=0.6,
-                top_p=0.9,
-                top_k=None,
-                repetition_penalty=1.1,
-                use_cache=True,
-                pad_token_id=processor.tokenizer.eos_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id
-            )
-        if output is None or len(output) == 0:
-            return "❌ No output generated"
-        # Proper decoding
-        if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
-            input_length = inputs['input_ids'].shape[1]
-            if len(output[0]) > input_length:
-                generate_ids = output[0][input_length:]
-                result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-            else:
-                result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        else:
-            result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        result = result.strip()
-        result = apply_smart_corrections(result)
-        # Cleanup after generation
-        del inputs, output
-        gc.collect()
-        # Apply postprocessing
-        final_result = postprocess_caption(result, max_chars=max_chars)
-        return final_result if final_result else "❌ Empty result"
-    except Exception as e:
-        gc.collect()
-        return f"❌ Error: {str(e)[:200]}"
-# Individual GPU-decorated functions for CAPTIONS
-@spaces.GPU(duration=50)
-@torch.no_grad()
-def generate_engaging_only(image, custom_instruction=""):
-    result = safe_generate_caption_direct(image, "engaging", max_chars=1000, custom_instruction=custom_instruction) if image else "❌ Upload image first"
-    gc.collect()
-    return result
-@spaces.GPU(duration=50)
-@torch.no_grad()
-def generate_casual_friend_only(image, keywords_text="", custom_instruction=""):
-    result = safe_generate_caption_direct(image, "casual_friend", max_chars=1000, keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
-    gc.collect()
-    return result
-@spaces.GPU(duration=50)
-@torch.no_grad()
-def generate_uncensored_keywords_only(image, keywords_text="", custom_instruction=""):
-    result = safe_generate_caption_direct(image, "uncensored_keywords", max_chars=1000, keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
-    gc.collect()
-    return result
-# Playground function
-@spaces.GPU(duration=50)
-@torch.no_grad()
-def generate_playground(image, system_prompt, user_prompt):
-    result = safe_generate_custom_prompt(image, system_prompt, user_prompt, max_chars=1000) if image else "❌ Upload image first"
-    gc.collect()
-    return result
-# Separate Q&A function - keep this accurate and focused
-@spaces.GPU(duration=40)
-@torch.no_grad()
-def answer_question(image, question):
-    """Answer questions about the image - focused and accurate"""
     if not image:
         return "❌ Upload image first"
-    if not question or not question.strip():
-        return "❌ Please ask a question"
-    # Short, direct Q&A prompt
-    qa_prompt = f"Answer this question about the image: {question.strip()}"
-    # Simple system message
-    convo = [
-        {"role": "system", "content": "You are a helpful image analyst."},
-        {"role": "user", "content": qa_prompt}
-    ]
-    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
-    device = next(model.parameters()).device
-    inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
-    if 'pixel_values' in inputs:
-        inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
-    with torch.no_grad():
-        output = model.generate(
-            **inputs,
-            max_new_tokens=200,
-            do_sample=True,
-            temperature=0.3,  # Lower temperature for more accurate Q&A
-            top_p=0.8,
-            top_k=None,
-            repetition_penalty=1.1,
-            use_cache=True,
-            pad_token_id=processor.tokenizer.eos_token_id,
-            eos_token_id=processor.tokenizer.eos_token_id
-        )
-    # Decode result
-    if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
-        input_length = inputs['input_ids'].shape[1]
-        if len(output[0]) > input_length:
-            generate_ids = output[0][input_length:]
-            result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        else:
-            result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-    else:
-        result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-    result = result.strip()
-    # Cleanup
-    del inputs, output
-    gc.collect()
-    final_result = postprocess_caption(result, max_chars=300)
     return final_result if final_result else "❌ No answer generated"
-# ===== Export =====
-def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path=""):
     try:
         data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source":"JoyCaption","data":{}}
         if keywords.strip(): data["data"]["keywords"]=keywords.strip()
@@ -382,8 +240,8 @@ def export_joycaption_data(keywords, custom_instructions, question, engaging_cap
             image_url=fix_image_url(image_path, host=(SPACE_HOST or ""))
             if image_url: data["data"]["image_url"]=image_url
         if engaging_caption.strip(): data["data"]["caption_engaging"]=engaging_caption.strip()
-        if casual_caption.strip(): data["data"]["caption_casual_friend"]=casual_caption.strip()
-        if keywords_caption.strip(): data["data"]["caption_keywords"]=keywords_caption.strip()
         if qa_answer.strip(): data["data"]["qa_answer"]=qa_answer.strip()
         if not data["data"]:
             return "❌ No data to export", None
@@ -399,26 +257,31 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="📸 Upload Image", height=400)
-            keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2)
-            custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2)
-            question_input = gr.Textbox(label="❓ Ask Question", lines=2)
             ask_btn = gr.Button("❓ Ask", variant="secondary")
-            qa_output = gr.Textbox(label="Q&A", lines=4, show_copy_button=True)
         with gr.Column(scale=1):
-            g1 = gr.Button("📝 Casual Descriptive", variant="primary")
-            out1 = gr.Textbox(lines=7, show_copy_button=True)
-            g2 = gr.Button("🔥 Erotic", variant="secondary")
-            out2 = gr.Textbox(lines=7, show_copy_button=True)
-            g3 = gr.Button("🎯 Custom Instruction", variant="secondary")
-            out3 = gr.Textbox(lines=7, show_copy_button=True)
-            export_btn = gr.Button("📥 Export All Data")
             export_out = gr.Textbox(visible=False)
             export_file = gr.File(visible=False)
-    g1.click(generate_engaging_only, [image_input, custom_instruction_input], out1)
-    g2.click(generate_casual_friend_only, [image_input, keywords_input, custom_instruction_input], out2)
-    g3.click(generate_uncensored_keywords_only, [image_input, keywords_input, custom_instruction_input], out3)
-    ask_btn.click(answer_question, [image_input, question_input], qa_output)
     def handle_export(k, c, q, e1, e2, e3, qa, img):
         msg, fd = export_joycaption_data(k,c,q,e1,e2,e3,qa,img)
@@ -430,35 +293,27 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
         return gr.update(value=msg,visible=True), gr.update(visible=False)
     export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
-    # ===== PLAYGROUND SECTION =====
-    gr.HTML("<hr><h2>🧪 Playground - Custom Prompts</h2><p>Test custom system and user prompts (not included in JSON export)</p>")
-    with gr.Row():
-        with gr.Column(scale=1):
-            playground_system = gr.Textbox(
-                label="🔧 System Prompt",
-                lines=2,
-                value="You are a helpful image captioner.",
-                placeholder="Enter custom system prompt..."
-            )
-            playground_prompt = gr.Textbox(
-                label="💬 User Prompt",
-                lines=3,
-                value="Describe this image in detail. Include the people, their appearance, clothing, poses, expressions, and the setting.",
-                placeholder="Enter custom user prompt..."
-            )
-            playground_btn = gr.Button("🧪 Generate", variant="secondary")
-        with gr.Column(scale=1):
-            playground_output = gr.Textbox(
-                label="🎯 Playground Output",
-                lines=7,
-                show_copy_button=True,
-                placeholder="Custom prompt results will appear here..."
-            )
-    playground_btn.click(generate_playground, [image_input, playground_system, playground_prompt], playground_output)
 if __name__ == "__main__":
     demo.launch()

         return _wrap
     spaces.GPU = _spaces_gpu
 import gradio as gr
 import torch
+from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
 from PIL import Image
+import tempfile, gc, os, shutil, json, time
 from pathlib import Path
+from threading import Thread
+from typing import Generator
 from hf_space_utils import fix_image_url, postprocess_caption
 # ===== Storage cleanup setup =====
 TITLE = """
 <div style="text-align:center;margin:20px 0;">
+<h1>🎨 JoyCaption Three-Tone + Q&A (v4.0 ZeroGPU)</h1>
+<p><strong>✅ Restored working prompts • Natural model behavior • Proven parameters</strong></p>
 </div><hr>
 """
 print("🚀 Loading JoyCaption model...")
+processor = AutoProcessor.from_pretrained(MODEL_PATH)  # MODEL_PATH is defined outside the visible hunks
+model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)  # bfloat16 weights; device_map=0 presumably pins the model to CUDA device 0 — TODO confirm
+model.eval()  # inference only: put the module in eval mode
+cleanup_storage()  # helper defined earlier in the file (body not visible here); runs once after the model is loaded
+print("✅ Model loaded successfully!")  # startup log line, printed unconditionally after loading
+# ===== WORKING CAPTION PROMPTS (from beta-one space) =====
 CAPTION_PROMPTS = {
+    "engaging": "Write a descriptive caption for this image in a casual tone.",
+    "erotic": "Write a detailed, explicit description of this scene. Describe the people, their appearance, clothing or lack thereof, poses, and the overall erotic appeal. Use direct adult language and be thorough.",
+    "custom": "Write a detailed description for this image."
 }
+@spaces.GPU()
+@torch.no_grad()
+def generate_caption(image: Image.Image, tone: str, keywords_text: str = "", custom_instruction: str = "") -> Generator[str, None, None]:
+    """Generate caption using proven working method from beta-one space"""
+    torch.cuda.empty_cache()
+    if image is None:
+        yield "❌ No image provided"
+        return
+    # Build prompt based on tone (like the working spaces)
+    if tone == "engaging":
+        prompt = CAPTION_PROMPTS["engaging"]
+    elif tone == "erotic":
+        prompt = CAPTION_PROMPTS["erotic"]
+        if keywords_text and keywords_text.strip():
+            prompt += f" Pay special attention to these elements if present: {keywords_text.strip()}"
+    elif tone == "custom":
+        prompt = CAPTION_PROMPTS["custom"]
+        if custom_instruction and custom_instruction.strip():
+            prompt += f" Make sure that you mention: {custom_instruction.strip()}"
+        if keywords_text and keywords_text.strip():
+            prompt += f" You are allowed to use these keywords: {keywords_text.strip()}"
+    # Use the working system prompt from beta-one space
+    convo = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
+        },
+        {
+            "role": "user",
+            "content": prompt.strip(),
+        },
+    ]
+    # Use the exact same processing as beta-one space
+    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to('cuda')
+    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
+    streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    # Use the exact same parameters as beta-one space (NO repetition penalty!)
+    generate_kwargs = dict(
+        **inputs,
+        max_new_tokens=600,
+        do_sample=True,
+        temperature=0.6,  # Proven working value
+        top_k=None,
+        top_p=0.9,        # Proven working value
+        use_cache=True,
+        streamer=streamer,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+@spaces.GPU()
+@torch.no_grad()
+def answer_question(image: Image.Image, question: str) -> Generator[str, None, None]:
+    """Q&A using proven working method from VQA space"""
+    torch.cuda.empty_cache()
+    if image is None:
+        yield "❌ No image provided"
+        return
+    if not question or not question.strip():
+        yield "❌ Please ask a question"
+        return
+    # Use the exact same approach as the working VQA space
+    convo = [
+        {
+            "role": "system",
+            "content": "You are a helpful image captioner.",  # From VQA space
+        },
+        {
+            "role": "user",
+            "content": question.strip(),  # Direct user input like VQA space
+        },
+    ]
+    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to('cuda')
+    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
+    streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    # Use VQA space parameters (NO repetition penalty!)
+    generate_kwargs = dict(
+        **inputs,
+        max_new_tokens=300,
+        do_sample=True,
+        temperature=0.6,   # From VQA space
+        top_k=None,
+        top_p=0.9,         # From VQA space
+        use_cache=True,
+        streamer=streamer,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+# Wrapper functions for gradio (non-streaming for simplicity)
+def generate_engaging_caption(image, custom_instruction=""):
     if not image:
         return "❌ Upload image first"
+    result = ""
+    for chunk in generate_caption(image, "engaging", custom_instruction=custom_instruction):
+        result = chunk
+    # Apply the same postprocessing
+    final_result = postprocess_caption(result, max_chars=1000)
+    return final_result if final_result else "❌ No result generated"
+def generate_erotic_caption(image, keywords_text="", custom_instruction=""):
+    if not image:
+        return "❌ Upload image first"
+    result = ""
+    for chunk in generate_caption(image, "erotic", keywords_text=keywords_text, custom_instruction=custom_instruction):
+        result = chunk
+    final_result = postprocess_caption(result, max_chars=1000)
+    return final_result if final_result else "❌ No result generated"
+def generate_custom_caption(image, keywords_text="", custom_instruction=""):
+    if not image:
+        return "❌ Upload image first"
+    result = ""
+    for chunk in generate_caption(image, "custom", keywords_text=keywords_text, custom_instruction=custom_instruction):
+        result = chunk
+    final_result = postprocess_caption(result, max_chars=1000)
+    return final_result if final_result else "❌ No result generated"
+def ask_question(image, question):
+    if not image:
+        return "❌ Upload image first"
+    if not question or not question.strip():
+        return "❌ Please ask a question"
+    result = ""
+    for chunk in answer_question(image, question):
+        result = chunk
+    final_result = postprocess_caption(result, max_chars=400)
     return final_result if final_result else "❌ No answer generated"
+# ===== Export function =====
+def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, erotic_caption, custom_caption, qa_answer, image_path=""):
     try:
         data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source":"JoyCaption","data":{}}
         if keywords.strip(): data["data"]["keywords"]=keywords.strip()
             image_url=fix_image_url(image_path, host=(SPACE_HOST or ""))
             if image_url: data["data"]["image_url"]=image_url
         if engaging_caption.strip(): data["data"]["caption_engaging"]=engaging_caption.strip()
+        if erotic_caption.strip(): data["data"]["caption_erotic"]=erotic_caption.strip()
+        if custom_caption.strip(): data["data"]["caption_custom"]=custom_caption.strip()
         if qa_answer.strip(): data["data"]["qa_answer"]=qa_answer.strip()
         if not data["data"]:
             return "❌ No data to export", None
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="📸 Upload Image", height=400)
+            keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2, placeholder="Optional: Keywords for erotic/custom captions")
+            custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2, placeholder="Optional: Custom instruction for third caption")
+            question_input = gr.Textbox(label="❓ Ask Question", lines=2, placeholder="Ask anything about the image")
             ask_btn = gr.Button("❓ Ask", variant="secondary")
+            qa_output = gr.Textbox(label="Q&A Answer", lines=4, show_copy_button=True)
         with gr.Column(scale=1):
+            g1 = gr.Button("📝 Casual Descriptive", variant="primary", size="lg")
+            out1 = gr.Textbox(label="Casual Caption", lines=6, show_copy_button=True)
+            g2 = gr.Button("🔥 Erotic", variant="secondary", size="lg")
+            out2 = gr.Textbox(label="Erotic Caption", lines=6, show_copy_button=True)
+            g3 = gr.Button("🎯 Custom Instruction", variant="secondary", size="lg")
+            out3 = gr.Textbox(label="Custom Caption", lines=6, show_copy_button=True)
+            export_btn = gr.Button("📥 Export All Data", variant="secondary")
             export_out = gr.Textbox(visible=False)
             export_file = gr.File(visible=False)
+    # Connect buttons
+    g1.click(generate_engaging_caption, [image_input, custom_instruction_input], out1)
+    g2.click(generate_erotic_caption, [image_input, keywords_input, custom_instruction_input], out2)
+    g3.click(generate_custom_caption, [image_input, keywords_input, custom_instruction_input], out3)
+    ask_btn.click(ask_question, [image_input, question_input], qa_output)
     def handle_export(k, c, q, e1, e2, e3, qa, img):
         msg, fd = export_joycaption_data(k,c,q,e1,e2,e3,qa,img)
         return gr.update(value=msg,visible=True), gr.update(visible=False)
     export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
+    gr.HTML("<hr><h2>📋 Usage Instructions</h2>")
+    gr.Markdown("""
+    ### **How to Use:**
+    1. **📸 Upload an image** in the left panel
+    2. **🎯 Optional**: Add keywords or custom instructions
+    3. **Click caption buttons** to generate different styles
+    4. **❓ Ask questions** about the image using natural language
+    5. **📥 Export** all results as JSON
+    ### **Caption Types:**
+    - **📝 Casual Descriptive**: Natural, conversational descriptions
+    - **🔥 Erotic**: Explicit adult content descriptions (uses keywords)
+    - **🎯 Custom Instruction**: Follows your specific instructions (uses both keywords and custom instruction)
+    ### **✨ Key Improvements:**
+    - Uses **proven working prompts** from original JoyCaption spaces
+    - **Natural model behavior** without over-engineering
+    - **No repetition penalties** that caused glitches
+    - **Same parameters** as working reference spaces (temp 0.6, top-p 0.9)
+    """)
 if __name__ == "__main__":
     demo.launch()