Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger commited on Oct 22, 2025

Commit

2f41a1f

verified ·

1 Parent(s): 656a9c0

Update app.py

Browse files

Files changed (1) hide show

app.py +457 -55

app.py CHANGED Viewed

@@ -1,88 +1,490 @@
-"""
-Copy of the full `app.py` into the deploy folder for direct upload.
-This file is a snapshot of the application's main entrypoint and should be
-identical to the root `app.py` when uploading to Hugging Face Spaces.
-"""
 try:
     import spaces
-    # Ensure spaces.GPU exists and is a decorator
-            return f
         return _wrap
     spaces.GPU = _spaces_gpu
-import gradio as gr
-import torch
-from transformers import LlavaForConditionalGeneration, AutoProcessor
         r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
-        # Nudity precision corrections
-        r'\\btopless women\\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
-        r'\\btopless woman\\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
-        # Person count corrections
-        r'\\bthree women\\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
-        r'\\bfour women\\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
-        # Clothing precision
-        r'\\bwearing nothing\\b': 'nude',
-        r'\\bnot wearing.*clothes\\b': 'nude',
-        r'\\bcompletely naked\\b': 'nude',
-        r'\\bfully nude\\b': 'nude',
     }
-    corrected_text = text
-        // Get all textareas and inputs from the page
-        const allInputs = document.querySelectorAll('textarea, input[type="text"]');
-        allInputs.forEach((field, index) => {
-            const placeholder = (field.placeholder || '').toLowerCase();
-            const value = field.value ? field.value.trim() : '';
-                interactive=True,
-                placeholder="Click the button above to generate engaging caption..."
             )
-            # Casual Friend caption
             with gr.Row():
-                with gr.Column(scale=4):
                 interactive=True,
-                placeholder="Click the button above to generate casual friend caption..."
             )
-            # NSFW section removed - caused hallucination
-            # Keywords caption
             with gr.Row():
-                with gr.Column(scale=4):
                 interactive=True,
-                placeholder="Click the button above to generate keywords caption..."
             )
-            # Body Parts Focus section removed - caused hallucination
-            # Descriptive text removed for cleaner interface
-            # Export functionality
             with gr.Row():
-                export_btn = gr.Button(
-    )
-    # NSFW button handler removed
     generate_uncensored_btn.click(
         generate_uncensored_keywords_only,
         inputs=[image_input, keywords_input, custom_instruction_input],
     )
-    # Body Parts Focus button handler removed
-    # Individual reload buttons - using direct generation for consistency
-    def reload_engaging_fn(image, custom_instruction):
-        return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"

 try:
     import spaces
+    if not hasattr(spaces, "GPU"):
+        def _spaces_gpu(*args, **kwargs):
+            def _wrap(f): return f
+            return _wrap
+        spaces.GPU = _spaces_gpu
+except Exception:
+    import types
+    spaces = types.SimpleNamespace()
+    def _spaces_gpu(*args, **kwargs):
+        def _wrap(f): return f
         return _wrap
     spaces.GPU = _spaces_gpu
+@spaces.GPU()
+def _joycaption_register_gpu():
+    # No-op; helps Spaces detect GPU runtime
+    return None
+import gradio as gr
+import torch
+from transformers import LlavaForConditionalGeneration, AutoProcessor
+from PIL import Image
+import tempfile, gc, os, shutil, json, time, re
+from pathlib import Path
+# ---------- Caches → temp ----------
+_tmpdir = tempfile.gettempdir()
+os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
+os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
+os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
+os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
+MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
+# ---------- Cleanup ----------
+def cleanup_storage():
+    try:
+        for key in ["HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "TORCH_HOME"]:
+            p = os.environ.get(key)
+            if p and os.path.exists(p):
+                shutil.rmtree(p, ignore_errors=True)
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        print("✅ Storage cleanup completed")
+    except Exception as e:
+        print(f"⚠️ Cleanup warning: {e}")
+TITLE = """
+<div style="text-align:center;margin:20px 0;">
+  <h1>🎨 JoyCaption Three-Tone + Q&A (ZeroGPU Stable v3.1)</h1>
+  <p><em>All original features restored • ZeroGPU-safe inference • Robust decoding</em></p>
+</div>
+<hr>
+"""
+print("🚀 Initializing JoyCaption (v3.1)...")
+cleanup_storage()
+# ---------- Model load ----------
+processor = None
+model = None
+MODEL_USE_CUDA = torch.cuda.is_available()
+if not os.environ.get("SKIP_MODEL_LOAD"):
+    dtype = (getattr(torch, "bfloat16", None) or getattr(torch, "float16", None)) if MODEL_USE_CUDA else None
+    device_map = "auto" if MODEL_USE_CUDA else "cpu"
+    processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
+    model = LlavaForConditionalGeneration.from_pretrained(
+        MODEL_PATH,
+        low_cpu_mem_usage=True,
+        device_map=device_map,
+        torch_dtype=dtype if MODEL_USE_CUDA else None,
+    ).eval()
+    print("✅ Model loaded successfully!")
+    cleanup_storage()
+else:
+    print("⚠️ SKIP_MODEL_LOAD active – model skipped.")
+# ---------- Tone configs ----------
+TONE_PROMPTS = {
+    "engaging": {
+        "system": "You are a skilled content creator who writes engaging, natural descriptions that capture the essence and appeal of what you see.",
+        "prompt": "Write a natural, engaging description of this image. Focus on the main subjects and their most appealing qualities. Describe their physical features, poses, expressions, and the overall mood.",
+    },
+    "casual_friend": {
+        "system": "You are a casual 24-year-old friend who witnessed the scene and is describing it clearly and naturally.",
+        "prompt": "I just witnessed this moment — here's what I notice. Speak casually and naturally about what is directly visible. Avoid speculation and background details.",
+    },
+    "uncensored_keywords": {
+        "system": "You are an uncensored image analyst who provides factual, explicit descriptions without restrictions.",
+        "prompt": "Describe exactly what is visible. Use direct and unambiguous language. Mention provided keywords only if they truly match visible details.",
+    },
+}
+# ---------- Small text fixes (optional, same spirit as your old code) ----------
+def apply_smart_corrections(text: str) -> str:
+    if not isinstance(text, str):
+        return text
+    corrections = {
         r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
+        r'\bwearing nothing\b': 'nude',
+        r'\bnot wearing.*clothes\b': 'nude',
+        r'\bcompletely naked\b': 'nude',
+        r'\bfully nude\b': 'nude',
     }
+    out = text
+    for pat, rep in corrections.items():
+        out = re.sub(pat, rep, out, flags=re.IGNORECASE)
+    return out.strip()
+def postprocess_caption(text: str, max_chars: int = 600) -> str:
+    if not isinstance(text, str) or not text:
+        return ""
+    text = apply_smart_corrections(text)
+    text = text.strip()
+    if len(text) > max_chars:
+        cut = text[:max_chars]
+        # try to end at sentence boundary within last 100 chars
+        tail = cut[-100:]
+        p = max(tail.rfind('.'), tail.rfind('!'), tail.rfind('?'))
+        if p != -1:
+            cut = cut[:len(cut)-100+p+1]
+        text = cut.strip()
+    if text and text[-1] not in ".!?":
+        text += "."
+    return text
+# ---------- Core: prepare inputs (ZeroGPU-safe) ----------
+def _prepare_inputs_and_device(convo, image):
+    # Gradio supplies PIL because we use type="pil"
+    if isinstance(image, (str, Path)):
+        image = Image.open(image).convert("RGB")
+    elif not isinstance(image, Image.Image):
+        raise ValueError("Invalid image input type")
+    # Build conversation string via chat template
+    try:
+        convo_string = processor.apply_chat_template(
+            convo, tokenize=False, add_generation_prompt=True
+        )
+    except Exception:
+        # Fallback: join messages
+        convo_string = "\n".join(str(x.get("content", "")) for x in convo)
+    # Tokenize + encode (always pass lists so processor returns batched tensors)
+    inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
+    # Ensure batch dimension [1, ...] for every tensor (ZeroGPU requires 2D/4D shapes)
+    for k, v in list(inputs.items()):
+        if torch.is_tensor(v):
+            if v.ndim == 1:
+                v = v.unsqueeze(0)      # -> [1, seq_len]
+            elif k == "pixel_values" and v.ndim == 3:
+                v = v.unsqueeze(0)      # -> [1, C, H, W]
+            # bool masks can confuse generate(); cast to int
+            if v.dtype == torch.bool:
+                v = v.to(torch.int)
+            inputs[k] = v
+    # Move to the model device
+    device = next(model.parameters()).device
+    for k, v in inputs.items():
+        if torch.is_tensor(v):
+            inputs[k] = v.to(device, non_blocking=True)
+    return inputs
+# ---------- Core: decode (robust to 1D/2D) ----------
+def _decode_output(inputs, output):
+    if output is None or len(output) == 0:
+        return ""
+    try:
+        input_ids = inputs.get("input_ids")
+        input_len = input_ids.shape[-1] if (isinstance(input_ids, torch.Tensor) and input_ids.ndim > 0) else 0
+        text = processor.tokenizer.decode(
+            output[0][input_len:],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return text.strip()
+    except Exception as e:
+        print(f"⚠️ Decode fallback: {e}")
+        try:
+            return processor.tokenizer.decode(output[0], skip_special_tokens=True).strip()
+        except Exception:
+            return ""
+def cleanup_after_inference():
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+# ---------- Core: generate (no invalid flags on ZeroGPU) ----------
+def run_image_chat_generation(convo, image, max_new_tokens=180):
+    if processor is None or model is None:
+        return None, "❌ Model not initialized."
+    try:
+        inputs = _prepare_inputs_and_device(convo, image)
+        # On ZeroGPU backends, temperature/top_p may be ignored and can even trigger warnings;
+        # keep generation minimal & stable.
+        gen_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            pad_token_id=processor.tokenizer.eos_token_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
+        with torch.no_grad():
+            output = model.generate(**gen_kwargs)
+        decoded = _decode_output(inputs, output)
+        cleanup_after_inference()
+        return decoded, None
+    except Exception as e:
+        cleanup_after_inference()
+        return None, f"❌ Generation error: {str(e)[:300]}"
+# ---------- Caption helpers (features restored) ----------
+def safe_generate_caption_direct(image, tone, keywords_text="", custom_instruction="", max_chars=600):
+    tone_conf = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
+    base_prompt = tone_conf["prompt"]
+    if tone == "uncensored_keywords" and keywords_text and keywords_text.strip():
+        base_prompt += f"\n\nKeywords (ONLY if truly visible): {keywords_text.strip()}"
+    if custom_instruction and custom_instruction.strip():
+        base_prompt += f"\n\nInclude this detail: {custom_instruction.strip()}"
+    convo = [
+        {"role": "system", "content": tone_conf["system"]},
+        {"role": "user", "content": base_prompt},
+    ]
+    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=220)
+    if err:
+        return err
+    return postprocess_caption(decoded or "", max_chars=max_chars) or "❌ Empty result"
+@spaces.GPU(duration=45)
+@torch.no_grad()
+def generate_engaging_only(image, custom_instruction=""):
+    return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
+@spaces.GPU(duration=45)
+@torch.no_grad()
+def generate_casual_friend_only(image, custom_instruction=""):
+    return safe_generate_caption_direct(image, "casual_friend", custom_instruction=custom_instruction) if image else "❌ Upload image first"
+@spaces.GPU(duration=45)
+@torch.no_grad()
+def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
+    return safe_generate_caption_direct(image, "uncensored_keywords", keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
+@spaces.GPU(duration=45)
+@torch.no_grad()
+def answer_question(image, question):
+    if not image: return "❌ Upload image first"
+    if not question or not question.strip(): return "❌ Please ask a question"
+    convo = [
+        {"role": "system", "content": "You are an image analyst who answers honestly and directly."},
+        {"role": "user", "content": f"Answer this question about the image clearly and directly: {question.strip()}"},
+    ]
+    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=220)
+    return err if err else (decoded.strip() or "❌ No answer")
+# ---------- Export ----------
+def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_reference=""):
+    try:
+        data = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "source": "JoyCaption", "data": {}}
+        if keywords and keywords.strip(): data["data"]["keywords"] = keywords.strip()
+        if custom_instructions and custom_instructions.strip(): data["data"]["custom_instructions"] = custom_instructions.strip()
+        if question and question.strip(): data["data"]["question"] = question.strip()
+        if image_reference and image_reference.strip(): data["data"]["image_reference"] = image_reference.strip()
+        if engaging_caption and engaging_caption.strip(): data["data"]["caption_engaging"] = engaging_caption.strip()
+        if casual_caption and casual_caption.strip(): data["data"]["caption_casual_friend"] = casual_caption.strip()
+        if keywords_caption and keywords_caption.strip(): data["data"]["caption_keywords"] = keywords_caption.strip()
+        if qa_answer and qa_answer.strip(): data["data"]["qa_answer"] = qa_answer.strip()
+        if not data["data"]:
+            return "❌ No data to export. Generate some captions first!", None
+        json_string = json.dumps(data, indent=2, ensure_ascii=False)
+        filename = f"joycaption_data_{time.strftime('%Y%m%d_%H%M%S')}.json"
+        return f"✅ Exported {len(data['data'])} fields: {', '.join(data['data'].keys())}", (json_string, filename)
+    except Exception as e:
+        return f"❌ Export failed: {str(e)}", None
+# ---------- Gradio UI (full features restored) ----------
+with gr.Blocks(title="JoyCaption ZeroGPU Stable", theme=gr.themes.Soft()) as demo:
+    gr.HTML(TITLE)
+    with gr.Row():
+        # Left
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="📸 Upload Image", height=400)
+            filename_display = gr.Textbox(
+                label="📂 Uploaded Filename",
+                interactive=False,
+                visible=True,
+                info="Auto-filled when you upload an image"
+            )
+            keywords_input = gr.Textbox(
+                placeholder="e.g., sensual, curves, intimate, alluring...",
+                label="🏷️ Keywords (used only by Uncensored tone)",
+                lines=2
+            )
+            image_reference_input = gr.Textbox(
+                placeholder="e.g., blonde_girl_001.jpg (optional override)",
+                label="🖼️ Image Reference (Manual Override)",
+                lines=1
+            )
+            custom_instruction_input = gr.Textbox(
+                placeholder="e.g., 'from instagram', 'left girl has red hair', 'beach setting'...",
+                label="🎯 Make sure to mention:",
+                lines=2
+            )
+            question_input = gr.Textbox(
+                placeholder="e.g., 'What are they doing?', 'Describe her pose'...",
+                label="❓ Ask a Question",
+                lines=2
             )
             with gr.Row():
+                ask_question_btn = gr.Button("❓ Ask Question", variant="secondary", size="sm")
+                clear_qa_btn = gr.Button("🗑️", size="sm", variant="secondary")
+            qa_output = gr.Textbox(
+                label="Q&A Answer",
+                lines=5,
+                show_copy_button=True,
                 interactive=True,
+                placeholder="Q&A answers will appear here..."
             )
+        # Right
+        with gr.Column(scale=1):
+            with gr.Row():
+                generate_engaging_btn = gr.Button("✨ Engaging", variant="primary", size="sm")
+                reload_engaging = gr.Button("🔄", size="sm", variant="secondary")
+                clear_engaging_btn = gr.Button("🗑️", size="sm", variant="secondary")
+            engaging_output = gr.Textbox(
+                label="Engaging Caption",
+                lines=5,
+                show_copy_button=True,
+                interactive=True,
+                placeholder="Generate engaging caption..."
+            )
             with gr.Row():
+                generate_friend_btn = gr.Button("😎 Casual Friend", variant="primary", size="sm")
+                reload_friend = gr.Button("🔄", size="sm", variant="secondary")
+                clear_friend_btn = gr.Button("🗑️", size="sm", variant="secondary")
+            friend_output = gr.Textbox(
+                label="Casual Friend Caption",
+                lines=5,
+                show_copy_button=True,
                 interactive=True,
+                placeholder="Generate casual caption..."
             )
+            with gr.Row():
+                generate_uncensored_btn = gr.Button("🔴 Uncensored + Keywords", variant="secondary", size="sm")
+                reload_uncensored = gr.Button("🔄", size="sm", variant="secondary")
+                clear_uncensored_btn = gr.Button("🗑️", size="sm", variant="secondary")
+            uncensored_output = gr.Textbox(
+                label="Uncensored + Keywords Caption",
+                lines=5,
+                show_copy_button=True,
+                interactive=True,
+                placeholder="Generate uncensored caption..."
+            )
             with gr.Row():
+                export_btn = gr.Button("📥 Export All Data (JSON)", variant="primary", size="lg")
+            export_output = gr.Textbox(label="Export Status", lines=2, interactive=False, visible=False)
+            export_file = gr.File(label="Download JSON", visible=False)
+    # Filename extraction on upload
+    def extract_filename(image):
+        if image is None:
+            return ""
+        try:
+            if hasattr(image, "filename") and image.filename:
+                return os.path.basename(image.filename)
+        except Exception:
+            pass
+        return "uploaded_image.jpg"
+    image_input.change(extract_filename, inputs=[image_input], outputs=filename_display)
+    # Generation handlers
+    generate_engaging_btn.click(
+        generate_engaging_only,
+        inputs=[image_input, custom_instruction_input],
+        outputs=engaging_output,
+        show_progress=True
+    )
+    generate_friend_btn.click(
+        generate_casual_friend_only,
+        inputs=[image_input, custom_instruction_input],
+        outputs=friend_output,
+        show_progress=True
+    )
     generate_uncensored_btn.click(
         generate_uncensored_keywords_only,
         inputs=[image_input, keywords_input, custom_instruction_input],
+        outputs=uncensored_output,
+        show_progress=True
+    )
+    # Reload handlers
+    reload_engaging.click(
+        generate_engaging_only,
+        inputs=[image_input, custom_instruction_input],
+        outputs=engaging_output,
+        show_progress=True
     )
+    reload_friend.click(
+        generate_casual_friend_only,
+        inputs=[image_input, custom_instruction_input],
+        outputs=friend_output,
+        show_progress=True
+    )
+    reload_uncensored.click(
+        generate_uncensored_keywords_only,
+        inputs=[image_input, keywords_input, custom_instruction_input],
+        outputs=uncensored_output,
+        show_progress=True
+    )
+    # Q&A
+    ask_question_btn.click(
+        answer_question,
+        inputs=[image_input, question_input],
+        outputs=qa_output,
+        show_progress=True
+    )
+    # Clear buttons
+    def clear_text(): return ""
+    clear_qa_btn.click(clear_text, outputs=qa_output)
+    clear_engaging_btn.click(clear_text, outputs=engaging_output)
+    clear_friend_btn.click(clear_text, outputs=friend_output)
+    clear_uncensored_btn.click(clear_text, outputs=uncensored_output)
+    # Export (writes into temp dir so it works on Spaces)
+    def handle_export(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_reference, upload_filename):
+        image_ref = (upload_filename or "").strip() or (image_reference or "")
+        message, file_data = export_joycaption_data(
+            keywords, custom_instructions, question,
+            engaging_caption, casual_caption, keywords_caption, qa_answer,
+            image_ref
+        )
+        if file_data:
+            json_string, fname = file_data
+            temp_path = os.path.join(tempfile.gettempdir(), fname)
+            with open(temp_path, "w", encoding="utf-8") as f:
+                f.write(json_string)
+            return gr.update(value=message, visible=True), gr.update(value=temp_path, visible=True)
+        else:
+            return gr.update(value=message, visible=True), gr.update(visible=False)
+    export_btn.click(
+        handle_export,
+        inputs=[
+            keywords_input, custom_instruction_input, question_input,
+            engaging_output, friend_output, uncensored_output, qa_output,
+            image_reference_input, filename_display
+        ],
+        outputs=[export_output, export_file]
+    ).then(lambda: gr.update(visible=True), outputs=[export_output]) \
+     .then(lambda: gr.update(visible=True), outputs=[export_file])
+if __name__ == "__main__":
+    demo.launch()