Hug0endob committed on
Commit
7766a5c
·
verified ·
1 Parent(s): f275d7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -86
app.py CHANGED
@@ -1,135 +1,103 @@
1
- import os, torch
2
- from transformers import AutoProcessor, LlavaForConditionalGeneration
3
- import gradio as gr
4
- from PIL import Image, ImageSequence
5
  import requests
6
  from io import BytesIO
 
 
 
 
 
7
 
8
- # ---- 1️⃣ Use a public repo ----
9
- MODEL_NAME = "llava-hf/joycaption-llama3.1-8b" # public version
10
 
11
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
 
 
12
  llava_model = LlavaForConditionalGeneration.from_pretrained(
13
  MODEL_NAME,
14
  device_map="cpu",
15
  torch_dtype=torch.bfloat16,
 
16
  )
17
  llava_model.eval()
18
 
19
- # -------------------------------------------------
20
- # Helper: download a file from a URL
21
- # -------------------------------------------------
22
- def download_bytes(url: str) -> bytes:
23
- resp = requests.get(url, stream=True, timeout=30)
24
  resp.raise_for_status()
25
  return resp.content
26
 
27
- # -------------------------------------------------
28
- # Helper: convert MP4 → GIF using ezgif.com (public API)
29
- # -------------------------------------------------
30
  def mp4_to_gif(mp4_bytes: bytes) -> bytes:
31
- """
32
- Sends the MP4 bytes to ezgif.com and returns the resulting GIF bytes.
33
- The API is undocumented but works via a simple multipart POST.
34
- """
35
  files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
36
- # ezgif.com endpoint for MP4 → GIF conversion
37
  resp = requests.post(
38
  "https://s.ezgif.com/video-to-gif",
39
  files=files,
40
  data={"file": "video.mp4"},
41
- timeout=60,
42
  )
43
  resp.raise_for_status()
44
-
45
- # The response HTML contains a link to the generated GIF.
46
- # We extract the first <img src="..."> that ends with .gif
47
- import re
48
-
49
  match = re.search(r'<img[^>]+src="([^"]+\.gif)"', resp.text)
 
 
 
50
  if not match:
51
  raise RuntimeError("Failed to extract GIF URL from ezgif response")
52
  gif_url = match.group(1)
53
-
54
- # ezgif serves the GIF from a relative path; make it absolute
55
  if gif_url.startswith("//"):
56
  gif_url = "https:" + gif_url
57
  elif gif_url.startswith("/"):
58
  gif_url = "https://s.ezgif.com" + gif_url
59
-
60
- gif_resp = requests.get(gif_url, timeout=30)
61
  gif_resp.raise_for_status()
62
  return gif_resp.content
63
 
64
- # -------------------------------------------------
65
- # Main inference function
66
- # -------------------------------------------------
67
- def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
68
- """
69
- 1. Download the resource.
70
- 2. If it is an MP4 → convert to GIF.
71
- 3. Load the first frame of the image/GIF.
72
- 4. Run JoyCaption and return the caption.
73
- """
74
- # -----------------------------------------------------------------
75
- # 1️⃣ Download raw bytes
76
- # -----------------------------------------------------------------
77
- raw = download_bytes(url)
78
-
79
- # -----------------------------------------------------------------
80
- # 2️⃣ Determine type & possibly convert MP4 → GIF
81
- # -----------------------------------------------------------------
82
- lower_url = url.lower()
83
- if lower_url.endswith(".mp4"):
84
- # Convert video to GIF
85
- raw = mp4_to_gif(raw)
86
- # After conversion we treat it as a GIF
87
- lower_url = ".gif"
88
-
89
- # -----------------------------------------------------------------
90
- # 3️⃣ Load image (first frame for GIFs)
91
- # -----------------------------------------------------------------
92
  img = Image.open(BytesIO(raw))
93
-
94
- # If the file is a multi‑frame GIF, pick the first frame
95
  if getattr(img, "is_animated", False):
96
  img = next(ImageSequence.Iterator(img))
97
-
98
- # Ensure RGB (JoyCaption expects 3‑channel images)
99
  if img.mode != "RGB":
100
  img = img.convert("RGB")
 
101
 
102
- # -----------------------------------------------------------------
103
- # 4️⃣ Run the model
104
- # -----------------------------------------------------------------
105
- inputs = processor(images=img, text=prompt, return_tensors="pt")
106
- inputs = {k: v.to(llava_model.device) for k, v in inputs.items()}
107
-
108
- with torch.no_grad():
109
- out_ids = llava_model.generate(**inputs, max_new_tokens=64)
110
-
111
- caption = processor.decode(out_ids[0], skip_special_tokens=True)
112
- return caption
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # -------------------------------------------------
115
- # Gradio UI
116
- # -------------------------------------------------
117
  iface = gr.Interface(
118
  fn=generate_caption_from_url,
119
  inputs=[
120
- gr.Textbox(
121
- label="Image / GIF / MP4 URL",
122
- placeholder="https://example.com/photo.jpg or https://example.com/clip.mp4",
123
- ),
124
  gr.Textbox(label="Prompt (optional)", value="Describe the image."),
125
  ],
126
  outputs=gr.Textbox(label="Generated caption"),
127
- title="JoyCaption URL input (supports GIF & MP4)",
128
- description=(
129
- "Enter a direct URL to an image, an animated GIF, or an MP4 video. "
130
- "MP4 files are automatically converted to GIF via ezgif.com, "
131
- "and the first frame of the GIF is captioned."
132
- ),
133
  allow_flagging="never",
134
  )
135
 
 
1
+ import os
2
+ import re
3
+ import torch
 
4
  import requests
5
  from io import BytesIO
6
+ from PIL import Image, ImageSequence
7
+ from transformers import AutoProcessor, LlavaForConditionalGeneration
8
+ import gradio as gr
9
+
10
# ---------------------------------------------------------------
# Model setup (runs once, at import time)
# ---------------------------------------------------------------
MODEL_NAME = "fancyfeast/llama-joycaption-beta-one-hf-llava"  # public repo

# An HF token is optional for a public repo; honour one if the Space sets it.
HF_TOKEN = os.getenv("HF_TOKEN")

# Forward the token to from_pretrained() only when one is actually present.
token_arg = dict(token=HF_TOKEN) if HF_TOKEN else dict()

processor = AutoProcessor.from_pretrained(MODEL_NAME, **token_arg)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="cpu",            # CPU-only Space
    torch_dtype=torch.bfloat16,  # 2 bytes/param instead of 4 for float32
    **token_arg,
)
llava_model.eval()  # inference mode: disables dropout etc.
25
 
26
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Direct link to the resource.
        timeout: Seconds before the request aborts.

    Raises:
        requests.HTTPError: on a non-2xx status.
    """
    # The original used stream=True and then read .content, which buffers the
    # whole body anyway but can leave the pooled connection un-released.
    # A context manager guarantees the response is closed.
    with requests.get(url, timeout=timeout) as resp:
        resp.raise_for_status()
        return resp.content
 
 
 
 
31
def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Convert MP4 bytes to GIF bytes via ezgif.com and return the GIF bytes.

    NOTE(review): this scrapes ezgif's undocumented endpoint and parses its
    HTML with regexes; the site's markup or upload flow can change at any
    time. Confirm the single-POST conversion still works (ezgif's normal flow
    is an upload step followed by a separate convert-form submission).

    Raises:
        requests.HTTPError: if either HTTP call returns a non-2xx status.
        RuntimeError: if no .gif URL can be found in the response HTML.
    """
    files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
    resp = requests.post(
        "https://s.ezgif.com/video-to-gif",
        files=files,
        data={"file": "video.mp4"},
        timeout=120,  # conversion of larger clips can be slow
    )
    resp.raise_for_status()
    # Primary extraction: first <img> tag whose src ends in .gif.
    match = re.search(r'<img[^>]+src="([^"]+\.gif)"', resp.text)
    if not match:
        # try to extract via other img tags
        match = re.search(r'src="([^"]+?/tmp/[^"]+\.gif)"', resp.text)
    if not match:
        raise RuntimeError("Failed to extract GIF URL from ezgif response")
    gif_url = match.group(1)
    # ezgif may return protocol-relative ("//...") or site-relative ("/...")
    # URLs; absolutize them before the final download.
    if gif_url.startswith("//"):
        gif_url = "https:" + gif_url
    elif gif_url.startswith("/"):
        gif_url = "https://s.ezgif.com" + gif_url
    gif_resp = requests.get(gif_url, timeout=60)
    gif_resp.raise_for_status()
    return gif_resp.content
54
 
55
def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
    """Decode *raw* image bytes and return the first frame as an RGB image."""
    frame = Image.open(BytesIO(raw))
    # Animated formats (e.g. GIF): caption only the opening frame.
    if getattr(frame, "is_animated", False):
        frame = next(ImageSequence.Iterator(frame))
    # The model expects 3-channel input; convert palette/greyscale frames.
    return frame if frame.mode == "RGB" else frame.convert("RGB")
62
 
63
def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
    """Download a media URL, reduce it to one RGB frame, and caption it.

    Args:
        url: Direct link to an image, GIF, or MP4.
        prompt: Instruction passed to the captioning model.

    Returns:
        The generated caption, or a human-readable error string — Gradio
        simply displays whatever string we return.
    """
    # Guard against None and whitespace-only input (the original only caught
    # empty/None, so "  " would trigger a doomed download).
    url = (url or "").strip()
    if not url:
        return "No URL provided."

    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"

    # Strip query string AND fragment before sniffing the file extension
    # (the original left "#fragment" attached, defeating .endswith(".mp4")).
    lower = url.lower().split("?")[0].split("#")[0]
    try:
        # Detect MP4 by extension or by the ISO-BMFF 'ftyp' marker that sits
        # within the first bytes of MP4 files.
        if lower.endswith(".mp4") or b"ftyp" in raw[:16].lower():
            try:
                raw = mp4_to_gif(raw)
            except Exception as e:
                return f"MP4→GIF conversion failed: {e}"
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    try:
        inputs = processor(images=img, text=prompt, return_tensors="pt")
        inputs = {k: v.to(llava_model.device) for k, v in inputs.items()}
        with torch.no_grad():
            out_ids = llava_model.generate(**inputs, max_new_tokens=128)
        return processor.decode(out_ids[0], skip_special_tokens=True)
    except Exception as e:
        return f"Inference error: {e}"
91
 
 
 
 
92
# ---------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------
# NOTE(review): there is no .launch() call here — confirm the Space's SDK
# auto-launches this top-level Interface.
url_box = gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg")
prompt_box = gr.Textbox(label="Prompt (optional)", value="Describe the image.")

iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[url_box, prompt_box],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption (public fancyfeast) - URL input",
    description="Paste a direct link to an image, GIF, or MP4. MP4 files are converted to GIF via ezgif.com; the first frame is captioned.",
    allow_flagging="never",
)
103