Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 24, 2025

Commit

989cc3a

verified ·

1 Parent(s): 6fe2bf5

Update app.py

Browse files

Files changed (1) hide show

app.py +226 -206

app.py CHANGED Viewed

@@ -15,18 +15,40 @@ except Exception:
 import gradio as gr
 import torch
-from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
 from PIL import Image
 import tempfile, gc, os, shutil, json, time
 from pathlib import Path
-from threading import Thread
-from typing import Generator
 from hf_space_utils import fix_image_url, postprocess_caption
 # ===== Storage cleanup setup =====
 _tmpdir = tempfile.gettempdir()
 os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
-os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
 os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
 os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
@@ -46,189 +68,200 @@ def cleanup_storage():
 TITLE = """
 <div style="text-align:center;margin:20px 0;">
-<h1>🎨 JoyCaption Three-Tone + Q&A (v4.0 ZeroGPU)</h1>
-<p><strong>✅ Restored working prompts • Natural model behavior • Proven parameters</strong></p>
 </div><hr>
 """
-print("🚀 Loading JoyCaption model...")
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
 model.eval()
 cleanup_storage()
 print("✅ Model loaded successfully!")
-# ===== WORKING CAPTION PROMPTS (from beta-one space) =====
-CAPTION_PROMPTS = {
-    "engaging": "Write a descriptive caption for this image in a casual tone.",
-    "erotic": "Write a detailed, explicit description of this scene. Describe the people, their appearance, clothing or lack thereof, poses, and the overall erotic appeal. Use direct adult language and be thorough.",
-    "custom": "Write a detailed description for this image."
-}
-@spaces.GPU()
-@torch.no_grad()
-def generate_caption(image: Image.Image, tone: str, keywords_text: str = "", custom_instruction: str = "") -> Generator[str, None, None]:
-    """Generate caption using proven working method from beta-one space"""
-    torch.cuda.empty_cache()
-    if image is None:
-        yield "❌ No image provided"
-        return
-    # Build prompt based on tone (like the working spaces)
-    if tone == "engaging":
-        prompt = CAPTION_PROMPTS["engaging"]
-    elif tone == "erotic":
-        prompt = CAPTION_PROMPTS["erotic"]
-        if keywords_text and keywords_text.strip():
-            prompt += f" Pay special attention to these elements if present: {keywords_text.strip()}"
-    elif tone == "custom":
-        prompt = CAPTION_PROMPTS["custom"]
-        if custom_instruction and custom_instruction.strip():
-            prompt += f" Make sure that you mention: {custom_instruction.strip()}"
-        if keywords_text and keywords_text.strip():
-            prompt += f" You are allowed to use these keywords: {keywords_text.strip()}"
-    # Use the working system prompt from beta-one space
-    convo = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
-        },
-        {
-            "role": "user",
-            "content": prompt.strip(),
-        },
-    ]
-    # Use the exact same processing as beta-one space
-    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to('cuda')
-    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
-    streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-    # Use the exact same parameters as beta-one space (NO repetition penalty!)
-    generate_kwargs = dict(
-        **inputs,
-        max_new_tokens=600,
-        do_sample=True,
-        temperature=0.6,  # Proven working value
-        top_k=None,
-        top_p=0.9,        # Proven working value
-        use_cache=True,
-        streamer=streamer,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
-@spaces.GPU()
-@torch.no_grad()
-def answer_question(image: Image.Image, question: str) -> Generator[str, None, None]:
-    """Q&A using proven working method from VQA space"""
-    torch.cuda.empty_cache()
-    if image is None:
-        yield "❌ No image provided"
-        return
-    if not question or not question.strip():
-        yield "❌ Please ask a question"
-        return
-    # Use the exact same approach as the working VQA space
-    convo = [
-        {
-            "role": "system",
-            "content": "You are a helpful image captioner.",  # From VQA space
-        },
-        {
-            "role": "user",
-            "content": question.strip(),  # Direct user input like VQA space
-        },
-    ]
-    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to('cuda')
-    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
-    streamer = TextIteratorStreamer(processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-    # Use VQA space parameters (NO repetition penalty!)
-    generate_kwargs = dict(
-        **inputs,
-        max_new_tokens=300,
-        do_sample=True,
-        temperature=0.6,   # From VQA space
-        top_k=None,
-        top_p=0.9,         # From VQA space
-        use_cache=True,
-        streamer=streamer,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
-# Wrapper functions for gradio (non-streaming for simplicity)
-def generate_engaging_caption(image, custom_instruction=""):
-    if not image:
-        return "❌ Upload image first"
-    result = ""
-    for chunk in generate_caption(image, "engaging", custom_instruction=custom_instruction):
-        result = chunk
-    # Apply the same postprocessing
-    final_result = postprocess_caption(result, max_chars=1000)
-    return final_result if final_result else "❌ No result generated"
-def generate_erotic_caption(image, keywords_text="", custom_instruction=""):
-    if not image:
-        return "❌ Upload image first"
-    result = ""
-    for chunk in generate_caption(image, "erotic", keywords_text=keywords_text, custom_instruction=custom_instruction):
-        result = chunk
-    final_result = postprocess_caption(result, max_chars=1000)
-    return final_result if final_result else "❌ No result generated"
-def generate_custom_caption(image, keywords_text="", custom_instruction=""):
-    if not image:
-        return "❌ Upload image first"
-    result = ""
-    for chunk in generate_caption(image, "custom", keywords_text=keywords_text, custom_instruction=custom_instruction):
-        result = chunk
-    final_result = postprocess_caption(result, max_chars=1000)
-    return final_result if final_result else "❌ No result generated"
-def ask_question(image, question):
     if not image:
         return "❌ Upload image first"
     if not question or not question.strip():
         return "❌ Please ask a question"
-    result = ""
-    for chunk in answer_question(image, question):
-        result = chunk
-    final_result = postprocess_caption(result, max_chars=400)
-    return final_result if final_result else "❌ No answer generated"
-# ===== Export function =====
 def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, erotic_caption, custom_caption, qa_answer, image_path=""):
     try:
         data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source":"JoyCaption","data":{}}
@@ -257,31 +290,26 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="📸 Upload Image", height=400)
-            keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2, placeholder="Optional: Keywords for erotic/custom captions")
-            custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2, placeholder="Optional: Custom instruction for third caption")
-            question_input = gr.Textbox(label="❓ Ask Question", lines=2, placeholder="Ask anything about the image")
             ask_btn = gr.Button("❓ Ask", variant="secondary")
-            qa_output = gr.Textbox(label="Q&A Answer", lines=4, show_copy_button=True)
         with gr.Column(scale=1):
-            g1 = gr.Button("📝 Casual Descriptive", variant="primary", size="lg")
-            out1 = gr.Textbox(label="Casual Caption", lines=6, show_copy_button=True)
-            g2 = gr.Button("🔥 Erotic", variant="secondary", size="lg")
-            out2 = gr.Textbox(label="Erotic Caption", lines=6, show_copy_button=True)
-            g3 = gr.Button("🎯 Custom Instruction", variant="secondary", size="lg")
-            out3 = gr.Textbox(label="Custom Caption", lines=6, show_copy_button=True)
-            export_btn = gr.Button("📥 Export All Data", variant="secondary")
             export_out = gr.Textbox(visible=False)
             export_file = gr.File(visible=False)
-    # Connect buttons
-    g1.click(generate_engaging_caption, [image_input, custom_instruction_input], out1)
-    g2.click(generate_erotic_caption, [image_input, keywords_input, custom_instruction_input], out2)
-    g3.click(generate_custom_caption, [image_input, keywords_input, custom_instruction_input], out3)
-    ask_btn.click(ask_question, [image_input, question_input], qa_output)
     def handle_export(k, c, q, e1, e2, e3, qa, img):
         msg, fd = export_joycaption_data(k,c,q,e1,e2,e3,qa,img)
@@ -293,26 +321,18 @@ with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as d
         return gr.update(value=msg,visible=True), gr.update(visible=False)
     export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
-    gr.HTML("<hr><h2>📋 Usage Instructions</h2>")
     gr.Markdown("""
-    ### **How to Use:**
-    1. **📸 Upload an image** in the left panel
-    2. **🎯 Optional**: Add keywords or custom instructions
-    3. **Click caption buttons** to generate different styles
-    4. **❓ Ask questions** about the image using natural language
-    5. **📥 Export** all results as JSON
-    ### **Caption Types:**
-    - **📝 Casual Descriptive**: Natural, conversational descriptions
-    - **🔥 Erotic**: Explicit adult content descriptions (uses keywords)
-    - **🎯 Custom Instruction**: Follows your specific instructions (uses both keywords and custom instruction)
-    ### **✨ Key Improvements:**
-    - Uses **proven working prompts** from original JoyCaption spaces
-    - **Natural model behavior** without over-engineering
-    - **No repetition penalties** that caused glitches
-    - **Same parameters** as working reference spaces (temp 0.6, top-p 0.9)
     """)
 if __name__ == "__main__":

 import gradio as gr
 import torch
+from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import tempfile, gc, os, shutil, json, time
 from pathlib import Path
 from hf_space_utils import fix_image_url, postprocess_caption
+# ===== AGGRESSIVE CACHE CLEARING =====
+def force_clear_all_caches():
+    """Force clear all possible caches"""
+    try:
+        # Clear CUDA cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        # Clear Python cache
+        gc.collect()
+        # Clear transformers cache
+        from transformers.utils import TRANSFORMERS_CACHE
+        if os.path.exists(TRANSFORMERS_CACHE):
+            shutil.rmtree(TRANSFORMERS_CACHE, ignore_errors=True)
+        print("🧹 All caches cleared!")
+    except Exception as e:
+        print(f"⚠️ Cache clear warning: {e}")
+# Force clear at startup
+force_clear_all_caches()
 # ===== Storage cleanup setup =====
 _tmpdir = tempfile.gettempdir()
 os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
+os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
 os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
 os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
 TITLE = """
 <div style="text-align:center;margin:20px 0;">
+<h1>🎨 JoyCaption Three-Tone + Q&A (v4.1 - Cache Cleared)</h1>
+<p><strong>🧹 Force cleared all caches • Fresh model load • Reset state</strong></p>
 </div><hr>
 """
+print("🚀 Loading JoyCaption model with fresh cache...")
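+# NOTE: cache_dir=None selects the default cache location and force_download=False reuses files already cached, so this is not a forced re-download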
+processor = AutoProcessor.from_pretrained(MODEL_PATH, cache_dir=None, force_download=False)
+model = LlavaForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    cache_dir=None,
+    force_download=False,
+    low_cpu_mem_usage=True
+)
 model.eval()
 cleanup_storage()
 print("✅ Model loaded successfully!")
+def apply_smart_corrections(text):
+    """Apply minimal corrections only"""
+    if not text or not isinstance(text, str):
+        return text
+    # Only remove leading "a photo of" type phrases - nothing else
+    import re
+    text = re.sub(r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*', '', text, flags=re.IGNORECASE)
+    return text
+def safe_generate_caption_direct(image, tone, max_chars=1000, keywords_text="", custom_instruction=""):
+    """Generate caption using simple, direct approach"""
+    try:
+        if image is None:
+            return f"❌ No image provided for {tone}"
+        # Clear cache before each generation
+        torch.cuda.empty_cache()
+        gc.collect()
+        # VERY SIMPLE PROMPTS (like original working spaces)
+        if tone == "engaging":
+            base_prompt = "Write a descriptive caption for this image in a casual tone."
+        elif tone == "erotic":
+            base_prompt = "Create a detailed, explicit description of this scene. Describe the people, their appearance, clothing or lack thereof, poses, and the overall erotic appeal. Use direct adult language and be thorough."
+            if keywords_text and keywords_text.strip():
+                base_prompt += f" Pay special attention to these elements if present: {keywords_text.strip()}"
+        elif tone == "custom":
+            base_prompt = "Write a detailed description for this image."
+            if custom_instruction and custom_instruction.strip():
+                base_prompt += f" Make sure that you mention: {custom_instruction.strip()}"
+            if keywords_text and keywords_text.strip():
+                base_prompt += f" You are allowed to use these keywords: {keywords_text.strip()}"
+        # Use EXACT same conversation format as working beta-one space
+        convo = [
+            {"role": "system", "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions."},
+            {"role": "user", "content": base_prompt}
+        ]
+        # Process exactly like working space
+        convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to("cuda")
+        inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
+        # EXACT parameters from working beta-one space
+        with torch.no_grad():
+            output = model.generate(
+                **inputs,
+                max_new_tokens=512,        # Same as beta-one default
+                do_sample=True,
+                temperature=0.6,           # Exact from beta-one
+                top_p=0.9,                # Exact from beta-one
+                top_k=None,               # Same as beta-one
+                use_cache=True,
+                pad_token_id=processor.tokenizer.eos_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
+                # NO repetition_penalty like working spaces!
+            )
+        if output is None or len(output) == 0:
+            return f"❌ No output generated for {tone}"
+        # Decode properly
+        if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
+            input_length = inputs['input_ids'].shape[1]
+            if len(output[0]) > input_length:
+                generate_ids = output[0][input_length:]
+                result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            else:
+                result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        else:
+            result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        result = result.strip()
+        result = apply_smart_corrections(result)  # Minimal corrections only
+        # Cleanup
+        del inputs, output
+        torch.cuda.empty_cache()
+        gc.collect()
+        # Apply postprocessing
+        final_result = postprocess_caption(result, max_chars=max_chars)
+        return final_result if final_result else f"❌ Empty result for {tone}"
+    except Exception as e:
+        torch.cuda.empty_cache()
+        gc.collect()
+        return f"❌ Error: {str(e)[:200]}"
+# Individual functions for each button
+@spaces.GPU(duration=60)
+@torch.no_grad()
+def generate_engaging_only(image, custom_instruction=""):
+    result = safe_generate_caption_direct(image, "engaging", max_chars=1000, custom_instruction=custom_instruction) if image else "❌ Upload image first"
+    return result
+@spaces.GPU(duration=60)
+@torch.no_grad()
+def generate_erotic_only(image, keywords_text="", custom_instruction=""):
+    result = safe_generate_caption_direct(image, "erotic", max_chars=1000, keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
+    return result
+@spaces.GPU(duration=60)
+@torch.no_grad()
+def generate_custom_only(image, keywords_text="", custom_instruction=""):
+    result = safe_generate_caption_direct(image, "custom", max_chars=1000, keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
+    return result
+@spaces.GPU(duration=40)
+@torch.no_grad()
+def answer_question(image, question):
+    """Q&A with simple approach like VQA space"""
     if not image:
         return "❌ Upload image first"
     if not question or not question.strip():
         return "❌ Please ask a question"
+    try:
+        torch.cuda.empty_cache()
+        gc.collect()
+        # Simple Q&A like VQA space
+        convo = [
+            {"role": "system", "content": "You are a helpful image captioner."},
+            {"role": "user", "content": question.strip()}
+        ]
+        convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to("cuda")
+        inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
+        with torch.no_grad():
+            output = model.generate(
+                **inputs,
+                max_new_tokens=200,
+                do_sample=True,
+                temperature=0.6,      # Same as VQA space
+                top_p=0.9,           # Same as VQA space
+                top_k=None,
+                use_cache=True,
+                pad_token_id=processor.tokenizer.eos_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
+            )
+        # Decode result
+        if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
+            input_length = inputs['input_ids'].shape[1]
+            if len(output[0]) > input_length:
+                generate_ids = output[0][input_length:]
+                result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            else:
+                result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        else:
+            result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        result = result.strip()
+        # Cleanup
+        del inputs, output
+        torch.cuda.empty_cache()
+        gc.collect()
+        final_result = postprocess_caption(result, max_chars=300)
+        return final_result if final_result else "❌ No answer generated"
+    except Exception as e:
+        torch.cuda.empty_cache()
+        gc.collect()
+        return f"❌ Q&A Error: {str(e)[:200]}"
+# ===== Export =====
 def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, erotic_caption, custom_caption, qa_answer, image_path=""):
     try:
         data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source":"JoyCaption","data":{}}
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="📸 Upload Image", height=400)
+            keywords_input = gr.Textbox(label="🏷️ Keywords", lines=2)
+            custom_instruction_input = gr.Textbox(label="🎯 Custom Instruction", lines=2)
+            question_input = gr.Textbox(label="❓ Ask Question", lines=2)
             ask_btn = gr.Button("❓ Ask", variant="secondary")
+            qa_output = gr.Textbox(label="Q&A", lines=4, show_copy_button=True)
         with gr.Column(scale=1):
+            g1 = gr.Button("📝 Casual Descriptive", variant="primary")
+            out1 = gr.Textbox(lines=7, show_copy_button=True)
+            g2 = gr.Button("🔥 Erotic", variant="secondary")
+            out2 = gr.Textbox(lines=7, show_copy_button=True)
+            g3 = gr.Button("🎯 Custom Instruction", variant="secondary")
+            out3 = gr.Textbox(lines=7, show_copy_button=True)
+            export_btn = gr.Button("📥 Export All Data")
             export_out = gr.Textbox(visible=False)
             export_file = gr.File(visible=False)
+    g1.click(generate_engaging_only, [image_input, custom_instruction_input], out1)
+    g2.click(generate_erotic_only, [image_input, keywords_input, custom_instruction_input], out2)
+    g3.click(generate_custom_only, [image_input, keywords_input, custom_instruction_input], out3)
+    ask_btn.click(answer_question, [image_input, question_input], qa_output)
     def handle_export(k, c, q, e1, e2, e3, qa, img):
         msg, fd = export_joycaption_data(k,c,q,e1,e2,e3,qa,img)
         return gr.update(value=msg,visible=True), gr.update(visible=False)
     export_btn.click(handle_export, [keywords_input, custom_instruction_input, question_input, out1, out2, out3, qa_output, image_input], [export_out, export_file])
+    gr.HTML("<hr>")
     gr.Markdown("""
+    ### **🧹 Cache Cleared Version**
+    This version aggressively clears all caches and forces fresh model loading to eliminate any persistent issues from previous versions.
+    **What's different:**
+    - 🧹 All caches cleared at startup
+    - 🔄 Fresh model load with no cached weights
+    - 💾 Cache cleared before each generation
+    - 🎯 Exact parameters from working reference spaces
+    - 📏 Max tokens: 512 (same as beta-one space default)
     """)
 if __name__ == "__main__":