Spaces:

Hug0endob
/

Joycaption-basic

Build error

App Files Community

Hug0endob committed on Dec 14, 2025

Commit

028a367

verified ·

1 Parent(s): 09c7c56

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -40

app.py CHANGED Viewed

@@ -1,38 +1,20 @@
 import os
 import re
 import torch
 import requests
-from io import BytesIO
 from PIL import Image, ImageSequence
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 import gradio as gr
-# ---------------------------
-# Config
-# ---------------------------
 MODEL_NAME = "fancyfeast/llama-joycaption-beta-one-hf-llava"
-HF_TOKEN = os.getenv("HF_TOKEN")  # optional secret in Space settings
-# ---------------------------
-# Load model & processor
-# ---------------------------
-token_arg = {"token": HF_TOKEN} if HF_TOKEN else {}
-processor = AutoProcessor.from_pretrained(MODEL_NAME, **token_arg)
-llava_model = LlavaForConditionalGeneration.from_pretrained(
-    MODEL_NAME,
-    device_map="cpu",
-    torch_dtype=torch.bfloat16,
-    **token_arg,
-)
-llava_model.eval()
-# ---------------------------
-# Helpers
-# ---------------------------
 def download_bytes(url: str, timeout: int = 30) -> bytes:
-    resp = requests.get(url, stream=True, timeout=timeout)
-    resp.raise_for_status()
-    return resp.content
 def mp4_to_gif(mp4_bytes: bytes) -> bytes:
     files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
@@ -53,21 +35,37 @@ def mp4_to_gif(mp4_bytes: bytes) -> bytes:
         gif_url = "https:" + gif_url
     elif gif_url.startswith("/"):
         gif_url = "https://s.ezgif.com" + gif_url
-    gif_resp = requests.get(gif_url, timeout=60)
-    gif_resp.raise_for_status()
-    return gif_resp.content
 def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
-    img = Image.open(BytesIO(raw))
     if getattr(img, "is_animated", False):
         img = next(ImageSequence.Iterator(img))
     if img.mode != "RGB":
         img = img.convert("RGB")
     return img
-# ---------------------------
 # Main inference
-# ---------------------------
 def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
     if not url:
         return "No URL provided."
@@ -78,7 +76,7 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") ->
     lower = url.lower().split("?")[0]
     try:
-        # crude MP4 detection by extension or ftyp box signature
         if lower.endswith(".mp4") or raw[:16].lower().find(b"ftyp") != -1:
             try:
                 raw = mp4_to_gif(raw)
@@ -88,9 +86,36 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") ->
     except Exception as e:
         return f"Image processing error: {e}"
     try:
-        inputs = processor(images=img, text=prompt, return_tensors="pt")
-        inputs = {k: v.to(llava_model.device) for k, v in inputs.items()}
         with torch.no_grad():
             out_ids = llava_model.generate(**inputs, max_new_tokens=128)
         caption = processor.decode(out_ids[0], skip_special_tokens=True)
@@ -98,10 +123,7 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") ->
     except Exception as e:
         return f"Inference error: {e}"
-# ---------------------------
-# Gradio UI (compatible init)
-# ---------------------------
-# Use try/except to support Gradio versions that don't accept allow_flagging
 gradio_kwargs = dict(
     fn=generate_caption_from_url,
     inputs=[
@@ -109,8 +131,8 @@ gradio_kwargs = dict(
         gr.Textbox(label="Prompt (optional)", value="Describe the image."),
     ],
     outputs=gr.Textbox(label="Generated caption"),
-    title="JoyCaption (fancyfeast) - URL input",
-    description="Paste a direct link to an image, GIF, or MP4. MP4 files are converted to GIF via ezgif.com; the first frame is captioned.",
 )
 try:
@@ -119,4 +141,14 @@ except TypeError:
     iface = gr.Interface(**gradio_kwargs)
 if __name__ == "__main__":
-    iface.launch()

 import os
 import re
+import io
 import torch
 import requests
 from PIL import Image, ImageSequence
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 import gradio as gr
 MODEL_NAME = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+HF_TOKEN = os.getenv("HF_TOKEN")  # optional
+# Helper: download bytes safely
 def download_bytes(url: str, timeout: int = 30) -> bytes:
+    with requests.get(url, stream=True, timeout=timeout) as resp:
+        resp.raise_for_status()
+        return resp.content
 def mp4_to_gif(mp4_bytes: bytes) -> bytes:
     files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
         gif_url = "https:" + gif_url
     elif gif_url.startswith("/"):
         gif_url = "https://s.ezgif.com" + gif_url
+    with requests.get(gif_url, timeout=60) as gif_resp:
+        gif_resp.raise_for_status()
+        return gif_resp.content
 def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
+    img = Image.open(io.BytesIO(raw))
     if getattr(img, "is_animated", False):
         img = next(ImageSequence.Iterator(img))
     if img.mode != "RGB":
         img = img.convert("RGB")
     return img
+# Load processor + model
+token_arg = {"use_auth_token": HF_TOKEN} if HF_TOKEN else {}
+# Some HF model variants require trust_remote_code and num_additional_image_tokens
+processor = AutoProcessor.from_pretrained(
+    MODEL_NAME,
+    trust_remote_code=True,
+    num_additional_image_tokens=1,  # safe default for many forks that use a CLS token
+    **({} if not HF_TOKEN else {"token": HF_TOKEN})
+)
+llava_model = LlavaForConditionalGeneration.from_pretrained(
+    MODEL_NAME,
+    device_map="cpu",
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    trust_remote_code=True,
+    **({} if not HF_TOKEN else {"token": HF_TOKEN})
+)
+llava_model.eval()
 # Main inference
 def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
     if not url:
         return "No URL provided."
     lower = url.lower().split("?")[0]
     try:
+        # crude MP4 detection
         if lower.endswith(".mp4") or raw[:16].lower().find(b"ftyp") != -1:
             try:
                 raw = mp4_to_gif(raw)
     except Exception as e:
         return f"Image processing error: {e}"
+    # Resize to safe resolution expected by many VLMs (adjust if your model docs say otherwise)
     try:
+        img = img.resize((512, 512), resample=Image.BICUBIC)
+    except Exception:
+        pass
+    try:
+        # Build conversation/chat input so processor inserts image placeholder correctly
+        conversation = [
+            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
+        ]
+        inputs = processor.apply_chat_template(
+            conversation,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            return_dict=True,
+            images=img,
+        )
+        # Move to model device and match dtype for pixel values
+        device = llava_model.device
+        inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        if "pixel_values" in inputs:
+            inputs["pixel_values"] = inputs["pixel_values"].to(dtype=llava_model.dtype, device=device)
+        # Debug shapes (helpful if mismatch persists)
+        if "pixel_values" in inputs:
+            print("pixel_values.shape:", inputs["pixel_values"].shape)
+        if "input_ids" in inputs:
+            print("input_ids.shape:", inputs["input_ids"].shape)
         with torch.no_grad():
             out_ids = llava_model.generate(**inputs, max_new_tokens=128)
         caption = processor.decode(out_ids[0], skip_special_tokens=True)
     except Exception as e:
         return f"Inference error: {e}"
+# Gradio UI
 gradio_kwargs = dict(
     fn=generate_caption_from_url,
     inputs=[
         gr.Textbox(label="Prompt (optional)", value="Describe the image."),
     ],
     outputs=gr.Textbox(label="Generated caption"),
+    title="JoyCaption - URL input",
+    description="Paste a direct link to an image/GIF/MP4 (MP4 will be converted).",
 )
 try:
     iface = gr.Interface(**gradio_kwargs)
 if __name__ == "__main__":
+    try:
+        iface.launch(server_name="0.0.0.0", server_port=7860)
+    finally:
+        # close event loop safely in Spaces environment
+        try:
+            import asyncio
+            loop = asyncio.get_event_loop()
+            if not loop.is_closed():
+                loop.close()
+        except Exception:
+            pass