Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 22, 2025

Commit

5c3558b

verified ·

1 Parent(s): 6a131b7

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -210

app.py CHANGED Viewed

@@ -1,219 +1,88 @@
 try:
     import spaces
-    if not hasattr(spaces, "GPU"):
-        def _spaces_gpu(*args, **kwargs):
-            def _wrap(f): return f
-            return _wrap
-        spaces.GPU = _spaces_gpu
-except Exception:
-    import types
-    spaces = types.SimpleNamespace()
-    def _spaces_gpu(*args, **kwargs):
-        def _wrap(f): return f
         return _wrap
     spaces.GPU = _spaces_gpu
-@spaces.GPU()
-def _joycaption_register_gpu():
-    return None
 import gradio as gr
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
-from PIL import Image
-import tempfile, gc, os, shutil, json
-from hf_space_utils import fix_image_url, postprocess_caption
-# ---------- Cache paths ----------
-_tmpdir = tempfile.gettempdir()
-for k in ["HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "TORCH_HOME"]:
-    os.environ[k] = os.path.join(_tmpdir, k.lower())
-MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
-SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
-# ---------- Cleanup ----------
-def cleanup_storage():
-    try:
-        for key in ["HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "TORCH_HOME"]:
-            p = os.environ.get(key)
-            if p and os.path.exists(p):
-                shutil.rmtree(p, ignore_errors=True)
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-        print("✅ Storage cleanup completed")
-    except Exception as e:
-        print(f"⚠️ Cleanup warning: {e}")
-TITLE = """
-<div style='text-align:center;margin:20px 0;'>
-<h1>🎨 JoyCaption Three-Tone + Q&A (ZeroGPU Stable v3.0)</h1>
-<p><em>Optimized for ZeroGPU – no tensor shape errors, no invalid flags.</em></p>
-</div><hr>
-"""
-print("🚀 Initializing JoyCaption ZeroGPU v3.0...")
-cleanup_storage()
-# ---------- Model load ----------
-processor = None
-model = None
-MODEL_USE_CUDA = torch.cuda.is_available()
-if not os.environ.get("SKIP_MODEL_LOAD"):
-    dtype = getattr(torch, "bfloat16", None) or getattr(torch, "float16", None) if MODEL_USE_CUDA else None
-    device_map = "auto" if MODEL_USE_CUDA else "cpu"
-    processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
-    model = LlavaForConditionalGeneration.from_pretrained(
-        MODEL_PATH,
-        low_cpu_mem_usage=True,
-        device_map=device_map,
-        torch_dtype=dtype if MODEL_USE_CUDA else None,
-    ).eval()
-    print("✅ Model loaded successfully!")
-    cleanup_storage()
-else:
-    print("⚠️ SKIP_MODEL_LOAD active – model skipped.")
-# ---------- Tone configs ----------
-TONE_PROMPTS = {
-    "engaging": {
-        "system": "You are a skilled content creator who writes engaging, natural descriptions.",
-        "prompt": "Write a natural, engaging description focusing on the main subjects.",
-    },
-    "casual_friend": {
-        "system": "You are a relaxed 24-year-old friend describing what you see.",
-        "prompt": "Describe clearly and casually only what is visible.",
-    },
-    "uncensored_keywords": {
-        "system": "You are an uncensored image analyst giving factual explicit descriptions.",
-        "prompt": "Describe exactly what is visible. Use direct, unambiguous language.",
-    },
-}
-# ---------- Prepare inputs ----------
-def _prepare_inputs_and_device(convo, image):
-    if isinstance(image, (str, os.PathLike)):
-        image = Image.open(image).convert("RGB")
-    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
-    for k, v in list(inputs.items()):
-        if torch.is_tensor(v):
-            # ensure [1, seq_len]
-            if v.ndim == 1:
-                v = v.unsqueeze(0)
-            inputs[k] = v
-    device = next(model.parameters()).device
-    inputs = {k: v.to(device) for k, v in inputs.items() if torch.is_tensor(v)}
-    return inputs
-# ---------- Decode ----------
-def _decode_output(inputs, output):
-    try:
-        input_len = inputs["input_ids"].shape[-1] if "input_ids" in inputs else 0
-        decoded = processor.tokenizer.decode(
-            output[0][input_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        return decoded.strip()
-    except Exception as e:
-        print(f"⚠️ Decode fallback: {e}")
-        try:
-            return processor.tokenizer.decode(output[0], skip_special_tokens=True).strip()
-        except Exception:
-            return ""
-def cleanup_after_inference():
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
-# ---------- Generation ----------
-def run_image_chat_generation(convo, image, max_new_tokens=150):
-    if not processor or not model:
-        return None, "❌ Model not initialized."
-    try:
-        inputs = _prepare_inputs_and_device(convo, image)
-        # ZeroGPU fix: remove unsupported args
-        gen_kwargs = dict(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            pad_token_id=processor.tokenizer.eos_token_id,
-            eos_token_id=processor.tokenizer.eos_token_id,
-        )
-        with torch.no_grad():
-            output = model.generate(**gen_kwargs)
-        decoded = _decode_output(inputs, output)
-        cleanup_after_inference()
-        return decoded, None
-    except Exception as e:
-        cleanup_after_inference()
-        return None, f"❌ Generation error: {str(e)}"
-# ---------- Caption helpers ----------
-def safe_generate_caption_direct(image, tone):
-    tone_conf = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
-    convo = [
-        {"role": "system", "content": tone_conf["system"]},
-        {"role": "user", "content": tone_conf["prompt"]},
-    ]
-    decoded, err = run_image_chat_generation(convo, image)
-    if err: return err
-    return postprocess_caption(decoded.strip()) if decoded else "❌ Empty result"
-@torch.no_grad()
-def generate_engaging_only(image):
-    return safe_generate_caption_direct(image, "engaging") if image else "❌ Upload image first"
-@torch.no_grad()
-def generate_casual_friend_only(image):
-    return safe_generate_caption_direct(image, "casual_friend") if image else "❌ Upload image first"
-@torch.no_grad()
-def generate_uncensored_keywords_only(image):
-    return safe_generate_caption_direct(image, "uncensored_keywords") if image else "❌ Upload image first"
-@torch.no_grad()
-def answer_question(image, question):
-    if not image: return "❌ Upload image first"
-    if not question.strip(): return "❌ Please ask a question"
-    convo = [
-        {"role": "system", "content": "You are an honest image analyst who answers directly."},
-        {"role": "user", "content": f"Question about this image: {question.strip()}"},
-    ]
-    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200)
-    return err if err else decoded.strip()
-# ---------- Gradio UI ----------
-with gr.Blocks(title="JoyCaption ZeroGPU Stable", theme=gr.themes.Soft()) as demo:
-    gr.HTML(TITLE)
-    with gr.Row():
-        with gr.Column(scale=1):
-            img = gr.Image(type="filepath", label="📸 Upload Image", height=400)
-            q = gr.Textbox(label="❓ Ask a Question", lines=2)
-            ask = gr.Button("Ask")
-            qa = gr.Textbox(label="Answer", lines=4)
-        with gr.Column(scale=1):
-            b1 = gr.Button("✨ Engaging")
-            o1 = gr.Textbox(lines=4)
-            b2 = gr.Button("😎 Casual Friend")
-            o2 = gr.Textbox(lines=4)
-            b3 = gr.Button("🔴 Keywords")
-            o3 = gr.Textbox(lines=4)
-    b1.click(generate_engaging_only, inputs=img, outputs=o1)
-    b2.click(generate_casual_friend_only, inputs=img, outputs=o2)
-    b3.click(generate_uncensored_keywords_only, inputs=img, outputs=o3)
-    ask.click(answer_question, inputs=[img, q], outputs=qa)
-if __name__ == "__main__":
-    demo.launch()

+"""
+Copy of the full `app.py` into the deploy folder for direct upload.
+This file is a snapshot of the application's main entrypoint and should be
+identical to the root `app.py` when uploading to Hugging Face Spaces.
+"""
 try:
     import spaces
+    # Ensure spaces.GPU exists and is a decorator
+            return f
         return _wrap
     spaces.GPU = _spaces_gpu
 import gradio as gr
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
+        r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
+        # Nudity precision corrections
+        r'\\btopless women\\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
+        r'\\btopless woman\\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
+        # Person count corrections
+        r'\\bthree women\\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
+        r'\\bfour women\\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
+        # Clothing precision
+        r'\\bwearing nothing\\b': 'nude',
+        r'\\bnot wearing.*clothes\\b': 'nude',
+        r'\\bcompletely naked\\b': 'nude',
+        r'\\bfully nude\\b': 'nude',
+    }
+    corrected_text = text
+        // Get all textareas and inputs from the page
+        const allInputs = document.querySelectorAll('textarea, input[type="text"]');
+        allInputs.forEach((field, index) => {
+            const placeholder = (field.placeholder || '').toLowerCase();
+            const value = field.value ? field.value.trim() : '';
+                interactive=True,
+                placeholder="Click the button above to generate engaging caption..."
+            )
+            # Casual Friend caption
+            with gr.Row():
+                with gr.Column(scale=4):
+                interactive=True,
+                placeholder="Click the button above to generate casual friend caption..."
+            )
+            # NSFW section removed - caused hallucination
+            # Keywords caption
+            with gr.Row():
+                with gr.Column(scale=4):
+                interactive=True,
+                placeholder="Click the button above to generate keywords caption..."
+            )
+            # Body Parts Focus section removed - caused hallucination
+            # Descriptive text removed for cleaner interface
+            # Export functionality
+            with gr.Row():
+                export_btn = gr.Button(
+    )
+    # NSFW button handler removed
+    generate_uncensored_btn.click(
+        generate_uncensored_keywords_only,
+        inputs=[image_input, keywords_input, custom_instruction_input],
+    )
+    # Body Parts Focus button handler removed
+    # Individual reload buttons - using direct generation for consistency
+    def reload_engaging_fn(image, custom_instruction):
+        return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"