Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 22, 2025

Commit

6a131b7

verified ·

1 Parent(s): ce6ad7a

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -154

app.py CHANGED Viewed

@@ -15,7 +15,6 @@ except Exception:
 @spaces.GPU()
 def _joycaption_register_gpu():
-    """Dummy GPU registration for HF Spaces."""
     return None
 import gradio as gr
@@ -23,15 +22,12 @@ import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import tempfile, gc, os, shutil, json
-from pathlib import Path
 from hf_space_utils import fix_image_url, postprocess_caption
 # ---------- Cache paths ----------
 _tmpdir = tempfile.gettempdir()
-os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
-os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
-os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
-os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
 SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
@@ -52,38 +48,32 @@ def cleanup_storage():
         print(f"⚠️ Cleanup warning: {e}")
 TITLE = """
-<div style="text-align: center; margin: 20px 0;">
-<h1>🎨 JoyCaption Three-Tone + Q&A (v2.7)</h1>
-<p><em>Stable version — all tensor and decode bugs fixed, faster responses.</em></p>
 </div><hr>
 """
-print("🚀 Initializing JoyCaption...")
 cleanup_storage()
 # ---------- Model load ----------
 processor = None
 model = None
-MODEL_TORCH_DTYPE = None
-MODEL_USE_CUDA = False
 if not os.environ.get("SKIP_MODEL_LOAD"):
-    use_cuda = torch.cuda.is_available()
-    if use_cuda:
-        torch_dtype = getattr(torch, "bfloat16", None) or getattr(torch, "float16", None)
-        device_map = "auto"
-        MODEL_USE_CUDA = True
-    else:
-        torch_dtype, device_map = None, "cpu"
     processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
-    model_kwargs = dict(low_cpu_mem_usage=True, device_map=device_map)
-    if torch_dtype and use_cuda:
-        model_kwargs["torch_dtype"] = torch_dtype
-    model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, **model_kwargs)
-    model.eval()
-    MODEL_TORCH_DTYPE = model_kwargs.get("torch_dtype")
     print("✅ Model loaded successfully!")
     cleanup_storage()
 else:
@@ -93,199 +83,137 @@ else:
 TONE_PROMPTS = {
     "engaging": {
         "system": "You are a skilled content creator who writes engaging, natural descriptions.",
-        "prompt": "Write a natural, engaging description of this image focusing on main subjects.",
-        "temperature": 0.6, "top_p": 0.85,
     },
     "casual_friend": {
         "system": "You are a relaxed 24-year-old friend describing what you see.",
-        "prompt": "Describe clearly and casually only what is visible, avoid speculation.",
-        "temperature": 0.8, "top_p": 0.9,
     },
     "uncensored_keywords": {
         "system": "You are an uncensored image analyst giving factual explicit descriptions.",
         "prompt": "Describe exactly what is visible. Use direct, unambiguous language.",
-        "temperature": 0.7, "top_p": 0.85,
     },
 }
 # ---------- Prepare inputs ----------
-def _prepare_inputs_and_device(convo_or_convo_string, image):
-    """Prepare processor inputs and move tensors safely to device."""
-    if isinstance(image, (str, Path)):
         image = Image.open(image).convert("RGB")
-    elif not isinstance(image, Image.Image):
-        raise ValueError("Invalid image input type")
-    convo_string = convo_or_convo_string
-    if isinstance(convo_or_convo_string, list):
-        try:
-            convo_string = processor.apply_chat_template(
-                convo_or_convo_string, tokenize=False, add_generation_prompt=True
-            )
-        except Exception:
-            convo_string = "\n".join(str(x.get("content", "")) for x in convo_or_convo_string)
     inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
-    # flatten, squeeze, sanitize
     for k, v in list(inputs.items()):
-        if isinstance(v, (list, tuple)):
-            v = v[0]
         if torch.is_tensor(v):
-            if v.dim() > 1 and v.shape[0] == 1:
-                v = v.squeeze(0)
-            if v.dtype == torch.bool:
-                v = v.to(torch.int)
-        inputs[k] = v
     device = next(model.parameters()).device
-    for k, v in inputs.items():
-        if hasattr(v, "to"):
-            inputs[k] = v.to(device, non_blocking=True)
-    if "pixel_values" in inputs:
-        dtype = MODEL_TORCH_DTYPE if MODEL_USE_CUDA and MODEL_TORCH_DTYPE else torch.float32
-        inputs["pixel_values"] = inputs["pixel_values"].to(dtype)
     return inputs
-# ---------- Decode (patched) ----------
 def _decode_output(inputs, output):
-    """Safely decode model output regardless of tensor shape."""
-    if output is None or len(output) == 0:
-        return ""
     try:
-        input_ids = inputs.get("input_ids")
-        if input_ids is not None and torch.is_tensor(input_ids):
-            input_len = input_ids.shape[-1] if input_ids.ndim > 0 else 0
-        else:
-            input_len = 0
         decoded = processor.tokenizer.decode(
-            output[0][input_len:],
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False,
         )
         return decoded.strip()
     except Exception as e:
-        print(f"⚠️ Decode fallback due to: {e}")
         try:
-            return processor.tokenizer.decode(
-                output[0],
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=False,
-            ).strip()
         except Exception:
             return ""
 def cleanup_after_inference():
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache(); torch.cuda.synchronize()
     gc.collect()
 # ---------- Generation ----------
-def run_image_chat_generation(convo, image, max_new_tokens=150, temperature=0.7, top_p=0.9):
-    if processor is None or model is None:
         return None, "❌ Model not initialized."
     try:
         inputs = _prepare_inputs_and_device(convo, image)
-        clean_inputs = {}
-        for k, v in inputs.items():
-            if torch.is_tensor(v):
-                if v.dtype == torch.bool:
-                    v = v.to(torch.int)
-                if v.dim() == 0:
-                    v = v.unsqueeze(0)
-            clean_inputs[k] = v
         with torch.no_grad():
-            output = model.generate(
-                **clean_inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=False,
-                temperature=temperature,
-                top_p=top_p,
-                repetition_penalty=1.05,
-                use_cache=True,
-                pad_token_id=processor.tokenizer.eos_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id,
-            )
-        decoded = _decode_output(clean_inputs, output)
         cleanup_after_inference()
         return decoded, None
     except Exception as e:
         cleanup_after_inference()
-        return None, f"❌ Generation error: {str(e)[:300]}"
 # ---------- Caption helpers ----------
-def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", custom_instruction=""):
-    try:
-        tone_conf = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
-        base_prompt = tone_conf["prompt"]
-        if tone == "uncensored_keywords" and keywords_text.strip():
-            base_prompt += f"\n\nKeywords (if visible): {keywords_text.strip()}"
-        if custom_instruction.strip():
-            base_prompt += f"\n\nInclude this detail: {custom_instruction.strip()}"
-        convo = [
-            {"role": "system", "content": tone_conf["system"]},
-            {"role": "user", "content": base_prompt},
-        ]
-        decoded, err = run_image_chat_generation(
-            convo, image, temperature=tone_conf["temperature"], top_p=tone_conf["top_p"]
-        )
-        if err: return err
-        return postprocess_caption((decoded or "").strip(), max_chars=max_chars) or "❌ Empty result"
-    except Exception as e:
-        cleanup_after_inference()
-        return f"❌ Error: {str(e)[:120]}"
 @torch.no_grad()
-def generate_engaging_only(image, custom_instruction=""):
-    return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
-def generate_casual_friend_only(image, custom_instruction=""):
-    return safe_generate_caption_direct(image, "casual_friend", custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
-def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
-    return safe_generate_caption_direct(image, "uncensored_keywords", keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
 def answer_question(image, question):
     if not image: return "❌ Upload image first"
     if not question.strip(): return "❌ Please ask a question"
     convo = [
-        {"role": "system", "content": "You are an uncensored image analyst who answers honestly and directly."},
-        {"role": "user", "content": f"Answer this question about the image: {question.strip()}"},
     ]
-    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200, temperature=0.4, top_p=0.9)
-    return err if err else (decoded.strip() or "❌ No answer")
 # ---------- Gradio UI ----------
-with gr.Blocks(title="JoyCaption Three-Tone + Q&A", theme=gr.themes.Soft()) as demo:
     gr.HTML(TITLE)
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(type="filepath", label="📸 Upload Image", height=400)
-            keywords_input = gr.Textbox(placeholder="e.g., sensual, curves...", label="🏷️ Keywords", lines=2)
-            custom_instruction_input = gr.Textbox(placeholder="e.g., 'left girl has red hair'...", label="🎯 Mention:", lines=2)
-            question_input = gr.Textbox(placeholder="e.g., 'What are they doing?'", label="❓ Ask a Question", lines=2)
-            ask_question_btn = gr.Button("❓ Ask Question", variant="secondary")
-            qa_output = gr.Textbox(label="", lines=5, show_copy_button=True)
         with gr.Column(scale=1):
-            generate_engaging_btn = gr.Button("✨ Engaging", variant="primary")
-            engaging_output = gr.Textbox(label="", lines=5, show_copy_button=True)
-            generate_friend_btn = gr.Button("😎 Casual Friend", variant="primary")
-            friend_output = gr.Textbox(label="", lines=5, show_copy_button=True)
-            generate_uncensored_btn = gr.Button("🔴 Keywords", variant="secondary")
-            uncensored_output = gr.Textbox(label="", lines=5, show_copy_button=True)
-    generate_engaging_btn.click(generate_engaging_only, [image_input, custom_instruction_input], engaging_output)
-    generate_friend_btn.click(generate_casual_friend_only, [image_input, custom_instruction_input], friend_output)
-    generate_uncensored_btn.click(generate_uncensored_keywords_only, [image_input, keywords_input, custom_instruction_input], uncensored_output)
-    ask_question_btn.click(answer_question, [image_input, question_input], qa_output)
 if __name__ == "__main__":
     demo.launch()

 @spaces.GPU()
 def _joycaption_register_gpu():
     return None
 import gradio as gr
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import tempfile, gc, os, shutil, json
 from hf_space_utils import fix_image_url, postprocess_caption
 # ---------- Cache paths ----------
 _tmpdir = tempfile.gettempdir()
+for k in ["HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "TORCH_HOME"]:
+    os.environ[k] = os.path.join(_tmpdir, k.lower())
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
 SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
         print(f"⚠️ Cleanup warning: {e}")
 TITLE = """
+<div style='text-align:center;margin:20px 0;'>
+<h1>🎨 JoyCaption Three-Tone + Q&A (ZeroGPU Stable v3.0)</h1>
+<p><em>Optimized for ZeroGPU – no tensor shape errors, no invalid flags.</em></p>
 </div><hr>
 """
+print("🚀 Initializing JoyCaption ZeroGPU v3.0...")
 cleanup_storage()
 # ---------- Model load ----------
 processor = None
 model = None
+MODEL_USE_CUDA = torch.cuda.is_available()
 if not os.environ.get("SKIP_MODEL_LOAD"):
+    dtype = getattr(torch, "bfloat16", None) or getattr(torch, "float16", None) if MODEL_USE_CUDA else None
+    device_map = "auto" if MODEL_USE_CUDA else "cpu"
     processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
+    model = LlavaForConditionalGeneration.from_pretrained(
+        MODEL_PATH,
+        low_cpu_mem_usage=True,
+        device_map=device_map,
+        torch_dtype=dtype if MODEL_USE_CUDA else None,
+    ).eval()
     print("✅ Model loaded successfully!")
     cleanup_storage()
 else:
 TONE_PROMPTS = {
     "engaging": {
         "system": "You are a skilled content creator who writes engaging, natural descriptions.",
+        "prompt": "Write a natural, engaging description focusing on the main subjects.",
     },
     "casual_friend": {
         "system": "You are a relaxed 24-year-old friend describing what you see.",
+        "prompt": "Describe clearly and casually only what is visible.",
     },
     "uncensored_keywords": {
         "system": "You are an uncensored image analyst giving factual explicit descriptions.",
         "prompt": "Describe exactly what is visible. Use direct, unambiguous language.",
     },
 }
 # ---------- Prepare inputs ----------
+def _prepare_inputs_and_device(convo, image):
+    if isinstance(image, (str, os.PathLike)):
         image = Image.open(image).convert("RGB")
+    convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
     for k, v in list(inputs.items()):
         if torch.is_tensor(v):
+            # ensure [1, seq_len]
+            if v.ndim == 1:
+                v = v.unsqueeze(0)
+            inputs[k] = v
     device = next(model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items() if torch.is_tensor(v)}
     return inputs
+# ---------- Decode ----------
 def _decode_output(inputs, output):
     try:
+        input_len = inputs["input_ids"].shape[-1] if "input_ids" in inputs else 0
         decoded = processor.tokenizer.decode(
+            output[0][input_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
         return decoded.strip()
     except Exception as e:
+        print(f"⚠️ Decode fallback: {e}")
         try:
+            return processor.tokenizer.decode(output[0], skip_special_tokens=True).strip()
         except Exception:
             return ""
 def cleanup_after_inference():
     gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
 # ---------- Generation ----------
+def run_image_chat_generation(convo, image, max_new_tokens=150):
+    if not processor or not model:
         return None, "❌ Model not initialized."
     try:
         inputs = _prepare_inputs_and_device(convo, image)
+        # ZeroGPU fix: remove unsupported args
+        gen_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            pad_token_id=processor.tokenizer.eos_token_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
         with torch.no_grad():
+            output = model.generate(**gen_kwargs)
+        decoded = _decode_output(inputs, output)
         cleanup_after_inference()
         return decoded, None
     except Exception as e:
         cleanup_after_inference()
+        return None, f"❌ Generation error: {str(e)}"
 # ---------- Caption helpers ----------
+def safe_generate_caption_direct(image, tone):
+    tone_conf = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
+    convo = [
+        {"role": "system", "content": tone_conf["system"]},
+        {"role": "user", "content": tone_conf["prompt"]},
+    ]
+    decoded, err = run_image_chat_generation(convo, image)
+    if err: return err
+    return postprocess_caption(decoded.strip()) if decoded else "❌ Empty result"
 @torch.no_grad()
+def generate_engaging_only(image):
+    return safe_generate_caption_direct(image, "engaging") if image else "❌ Upload image first"
 @torch.no_grad()
+def generate_casual_friend_only(image):
+    return safe_generate_caption_direct(image, "casual_friend") if image else "❌ Upload image first"
 @torch.no_grad()
+def generate_uncensored_keywords_only(image):
+    return safe_generate_caption_direct(image, "uncensored_keywords") if image else "❌ Upload image first"
 @torch.no_grad()
 def answer_question(image, question):
     if not image: return "❌ Upload image first"
     if not question.strip(): return "❌ Please ask a question"
     convo = [
+        {"role": "system", "content": "You are an honest image analyst who answers directly."},
+        {"role": "user", "content": f"Question about this image: {question.strip()}"},
     ]
+    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200)
+    return err if err else decoded.strip()
 # ---------- Gradio UI ----------
+with gr.Blocks(title="JoyCaption ZeroGPU Stable", theme=gr.themes.Soft()) as demo:
     gr.HTML(TITLE)
     with gr.Row():
         with gr.Column(scale=1):
+            img = gr.Image(type="filepath", label="📸 Upload Image", height=400)
+            q = gr.Textbox(label="❓ Ask a Question", lines=2)
+            ask = gr.Button("Ask")
+            qa = gr.Textbox(label="Answer", lines=4)
         with gr.Column(scale=1):
+            b1 = gr.Button("✨ Engaging")
+            o1 = gr.Textbox(lines=4)
+            b2 = gr.Button("😎 Casual Friend")
+            o2 = gr.Textbox(lines=4)
+            b3 = gr.Button("🔴 Keywords")
+            o3 = gr.Textbox(lines=4)
+    b1.click(generate_engaging_only, inputs=img, outputs=o1)
+    b2.click(generate_casual_friend_only, inputs=img, outputs=o2)
+    b3.click(generate_uncensored_keywords_only, inputs=img, outputs=o3)
+    ask.click(answer_question, inputs=[img, q], outputs=qa)
 if __name__ == "__main__":
     demo.launch()