"""Gradio demo: caption an image, animated GIF, or MP4 fetched from a URL.

The resource is downloaded, MP4 videos are converted to GIF through ezgif.com,
the first frame is extracted, and the frame is captioned with a LLaVA-style
JoyCaption model running on CPU.
"""

import os
import re
from io import BytesIO
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
import torch
from PIL import Image, ImageSequence
from transformers import AutoProcessor, LlavaForConditionalGeneration

# ---- 1️⃣ Use a public repo ----
MODEL_NAME = "llava-hf/joycaption-llama3.1-8b"  # public version

# NOTE: the processor and model are loaded at import time, so importing this
# module triggers the (potentially large) checkpoint download/load.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)
llava_model.eval()

# ezgif.com endpoint for MP4 → GIF conversion
_EZGIF_VIDEO_TO_GIF_URL = "https://s.ezgif.com/video-to-gif"

# Matches the first <img ... src="....gif"> tag in the ezgif response HTML.
_GIF_IMG_SRC_RE = re.compile(r'<img[^>]+src="([^"]+\.gif)"')


# -------------------------------------------------
# Helper: download a file from a URL
# -------------------------------------------------
def download_bytes(url: str) -> bytes:
    """Download *url* and return the complete response body.

    Args:
        url: Absolute HTTP(S) URL of the resource.

    Returns:
        The raw response body.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or timeout (30 s).
    """
    # The body is consumed in full right away, so streaming mode buys nothing.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.content


# -------------------------------------------------
# Helper: convert MP4 → GIF using ezgif.com (public API)
# -------------------------------------------------
def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Send MP4 bytes to ezgif.com and return the resulting GIF bytes.

    The API is undocumented but works via a simple multipart POST; the
    response is an HTML page from which the GIF link is scraped.

    Args:
        mp4_bytes: Raw contents of the MP4 file.

    Returns:
        The raw bytes of the converted GIF.

    Raises:
        requests.HTTPError: If the upload or the GIF download fails.
        RuntimeError: If no ``.gif`` image link is found in the response.
    """
    files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
    resp = requests.post(
        _EZGIF_VIDEO_TO_GIF_URL,
        files=files,
        data={"file": "video.mp4"},
        timeout=60,
    )
    resp.raise_for_status()

    # The response HTML contains a link to the generated GIF.
    # We extract the first <img src> that ends with .gif
    match = _GIF_IMG_SRC_RE.search(resp.text)
    if not match:
        raise RuntimeError("Failed to extract GIF URL from ezgif response")

    # ezgif may serve the GIF from a protocol-relative ("//host/..."),
    # root-relative ("/tmp/...") or plain relative path; urljoin resolves all
    # of them against the endpoint and leaves absolute URLs untouched.
    gif_url = urljoin(_EZGIF_VIDEO_TO_GIF_URL, match.group(1))

    gif_resp = requests.get(gif_url, timeout=30)
    gif_resp.raise_for_status()
    return gif_resp.content


# -------------------------------------------------
# Main inference function
# -------------------------------------------------
def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
    """Download the resource at *url* and caption it with the model.

    Steps:
        1. Download the resource.
        2. If it is an MP4 → convert to GIF.
        3. Load the first frame of the image/GIF.
        4. Run JoyCaption and return the caption.

    Args:
        url: Direct URL to an image, an animated GIF, or an MP4 video.
        prompt: Instruction text passed to the processor alongside the image.

    Returns:
        The generated caption (newly generated tokens only, without the
        prompt).

    Raises:
        ValueError: If *url* is empty or only whitespace.
        requests.RequestException: If any download fails.
        RuntimeError: If the MP4 → GIF conversion response cannot be parsed.
    """
    url = (url or "").strip()
    if not url:
        raise ValueError("A non-empty URL is required")

    # -----------------------------------------------------------------
    # 1️⃣ Download raw bytes
    # -----------------------------------------------------------------
    raw = download_bytes(url)

    # -----------------------------------------------------------------
    # 2️⃣ Determine type & possibly convert MP4 → GIF
    # -----------------------------------------------------------------
    # Inspect only the URL path so that query strings or fragments
    # (e.g. "clip.mp4?token=abc") do not hide the extension.
    if urlparse(url).path.lower().endswith(".mp4"):
        raw = mp4_to_gif(raw)

    # -----------------------------------------------------------------
    # 3️⃣ Load image (first frame for GIFs)
    # -----------------------------------------------------------------
    img = Image.open(BytesIO(raw))

    # If the file is a multi‑frame GIF, pick the first frame
    if getattr(img, "is_animated", False):
        img = next(ImageSequence.Iterator(img))

    # Ensure RGB (JoyCaption expects 3‑channel images)
    if img.mode != "RGB":
        img = img.convert("RGB")

    # -----------------------------------------------------------------
    # 4️⃣ Run the model
    # -----------------------------------------------------------------
    # BatchFeature.to() moves every tensor to the device and casts only the
    # floating-point ones (pixel values) to the model dtype; without the cast
    # float32 pixels would be fed into bfloat16 weights.
    inputs = processor(images=img, text=prompt, return_tensors="pt").to(
        llava_model.device, llava_model.dtype
    )

    with torch.no_grad():
        out_ids = llava_model.generate(**inputs, max_new_tokens=64)

    # generate() returns the prompt tokens followed by the new tokens; drop
    # the prompt part so the caption does not echo the instruction.
    prompt_len = inputs["input_ids"].shape[1]
    caption = processor.decode(out_ids[0][prompt_len:], skip_special_tokens=True)
    return caption


# -------------------------------------------------
# Gradio UI
# -------------------------------------------------
iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(
            label="Image / GIF / MP4 URL",
            placeholder="https://example.com/photo.jpg or https://example.com/clip.mp4",
        ),
        gr.Textbox(label="Prompt (optional)", value="Describe the image."),
    ],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption – URL input (supports GIF & MP4)",
    description=(
        "Enter a direct URL to an image, an animated GIF, or an MP4 video. "
        "MP4 files are automatically converted to GIF via ezgif.com, "
        "and the first frame of the GIF is captioned."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()