Hug0endob committed on
Commit
49d3ba7
·
verified ·
1 Parent(s): 10d27da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -100
app.py CHANGED
@@ -1,44 +1,28 @@
1
  import os
2
  import io
3
- import re
4
- import torch
5
  import requests
6
  from PIL import Image, ImageSequence
7
- from transformers import AutoProcessor, LlavaForConditionalGeneration
8
  import gradio as gr
9
 
10
- MODEL_NAME = "fancyfeast/llama-joycaption-beta-one-hf-llava"
11
- HF_TOKEN = os.getenv("HF_TOKEN") # optional
 
 
 
 
 
 
 
12
 
 
13
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* with a streaming GET and return the whole body as bytes.

    Raises requests.HTTPError for non-success status codes and the usual
    requests exceptions on network failures/timeouts.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        payload = response.content
    return payload
17
 
18
def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Convert MP4 bytes to GIF bytes via the ezgif.com web service.

    Uploads the video, scrapes the response page for the produced GIF's
    URL, downloads that GIF and returns its raw bytes.

    Raises:
        RuntimeError: when no GIF URL can be located in the response.
        requests.HTTPError: on non-success HTTP status codes.
    """
    upload = requests.post(
        "https://s.ezgif.com/video-to-gif",
        files={"new-file": ("video.mp4", mp4_bytes, "video/mp4")},
        data={"file": "video.mp4"},
        timeout=120,
    )
    upload.raise_for_status()

    # Primary pattern: any <img> tag pointing at a .gif; fallback: any src
    # that references a gif under a /tmp/ path.
    found = re.search(r'<img[^>]+src="([^"]+\.gif)"', upload.text)
    if not found:
        found = re.search(r'src="([^"]+?/tmp/[^"]+\.gif)"', upload.text)
    if not found:
        raise RuntimeError("Failed to extract GIF URL from ezgif response")

    gif_url = found.group(1)
    # Normalise scheme-relative ("//host/...") and site-relative ("/...") URLs.
    if gif_url.startswith("//"):
        gif_url = "https:" + gif_url
    elif gif_url.startswith("/"):
        gif_url = "https://s.ezgif.com" + gif_url

    with requests.get(gif_url, timeout=60) as gif_resp:
        gif_resp.raise_for_status()
        return gif_resp.content
40
-
41
- def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
42
  img = Image.open(io.BytesIO(raw))
43
  if getattr(img, "is_animated", False):
44
  img = next(ImageSequence.Iterator(img))
@@ -46,102 +30,69 @@ def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
46
  img = img.convert("RGB")
47
  return img
48
 
49
# Load processor + model
# NOTE(review): token_arg is built with the deprecated `use_auth_token` key but
# is never used below — the from_pretrained calls pass {"token": HF_TOKEN}
# directly instead; consider deleting this line.
token_arg = {"use_auth_token": HF_TOKEN} if HF_TOKEN else {}
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    # presumably required by the JoyCaption llava processor — TODO confirm
    num_additional_image_tokens=1,
    **({} if not HF_TOKEN else {"token": HF_TOKEN})
)
# CPU Space -> use float32
llava_model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    **({} if not HF_TOKEN else {"token": HF_TOKEN})
)
# Inference mode: disables dropout / training-only behavior.
llava_model.eval()
66
 
67
def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
    """Download media from *url*, extract a still frame and caption it with LLaVA.

    Returns either the generated caption or a human-readable error string;
    this function never raises — all failures are reported as text so the
    Gradio UI can display them.
    """
    if not url:
        return "No URL provided."
    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"

    # Strip any query string before the extension check.
    lower = url.lower().split("?")[0]
    try:
        # Detect MP4 by extension or by an 'ftyp' box near the file start
        # (the box type normally sits at byte offset 4, within raw[:16]).
        if lower.endswith(".mp4") or raw[:16].lower().find(b"ftyp") != -1:
            try:
                raw = mp4_to_gif(raw)
            except Exception as e:
                return f"MP4→GIF conversion failed: {e}"
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    # Resize to conservative default
    try:
        img = img.resize((512, 512), resample=Image.BICUBIC)
    except Exception:
        # Best effort: fall back to the original frame size on failure.
        pass

    try:
        conversation = [
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
        ]
        inputs = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            images=img,
        )
        # Move every tensor in the batch onto the model's device.
        device = llava_model.device
        inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
        # pixel_values must also match the model dtype (float32 on CPU).
        if "pixel_values" in inputs:
            inputs["pixel_values"] = inputs["pixel_values"].to(dtype=llava_model.dtype, device=device)

        # Minimal debug info (appears in Space logs)
        if "pixel_values" in inputs:
            print("pixel_values.shape:", inputs["pixel_values"].shape)
        if "input_ids" in inputs:
            print("input_ids.shape:", inputs["input_ids"].shape)

        with torch.no_grad():
            out_ids = llava_model.generate(**inputs, max_new_tokens=128)
        # NOTE(review): decoding the full sequence also includes the prompt
        # text in the returned caption — consider slicing off the first
        # inputs["input_ids"].shape[1] tokens before decoding.
        caption = processor.decode(out_ids[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        return f"Inference error: {e}"
 
 
 
 
 
120
 
121
# Interface kwargs are built once so the construction below can be retried
# without version-specific arguments.
gradio_kwargs = dict(
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg"),
        gr.Textbox(label="Prompt (optional)", value="Describe the image."),
    ],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption - URL input",
    description="Paste a direct link to an image/GIF/MP4 (MP4 will be converted).",
)

# `allow_flagging` is not accepted by all Gradio versions; fall back to the
# plain constructor when it raises TypeError.
try:
    iface = gr.Interface(**gradio_kwargs, allow_flagging="never")
except TypeError:
    iface = gr.Interface(**gradio_kwargs)
136
-
137
if __name__ == "__main__":
    try:
        # Bind to all interfaces on the standard HF Spaces port.
        iface.launch(server_name="0.0.0.0", server_port=7860)
    finally:
        # Best-effort shutdown hack: close the asyncio event loop so the
        # process exits instead of hanging; swallow any failure.
        try:
            import asyncio
            loop = asyncio.get_event_loop()
            if not loop.is_closed():
                loop.close()
        except Exception:
            pass
 
1
  import os
2
  import io
3
+ import sys
4
+ import time
5
  import requests
6
  from PIL import Image, ImageSequence
 
7
  import gradio as gr
8
 
9
# Try to import llama-cpp-python
try:
    from llama_cpp import Llama
except Exception as e:
    # Fail fast with an actionable message — the Space is unusable without it.
    raise RuntimeError("llama-cpp-python import failed; ensure requirements installed and wheel built: " + str(e))

MODEL_PATH = os.path.join("model", "model.gguf")  # start.sh places GGUF here
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Set correct GGUF in start.sh and redeploy.")
18
 
19
# Helper: fetch a URL and return its raw bytes.
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Return the full response body of *url* as bytes.

    Raises requests.HTTPError on non-2xx responses and the usual
    requests exceptions on network failure.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    try:
        response.raise_for_status()
        return response.content
    finally:
        response.close()
24
 
25
+ def load_first_frame_from_bytes(raw: bytes):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  img = Image.open(io.BytesIO(raw))
27
  if getattr(img, "is_animated", False):
28
  img = next(ImageSequence.Iterator(img))
 
30
  img = img.convert("RGB")
31
  return img
32
 
33
+ # Minimal image caption prompt template — adjust for your model's expected prompt
34
+ def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
35
+ # Many llama.cpp-based multimodal ggufs accept: "<img>{path}</img>\nUser: {prompt}\nAssistant:"
36
+ # We'll use that pattern.
37
+ return f"<img>{image_path}</img>\nUser: {user_prompt}\nAssistant:"
38
+
39
# Start model (llama-cpp-python will mmap model and run inference)
# Use low-memory opts: n_ctx small, use_mlock=0, n_gpu_layers=0
# NOTE(review): use_mlock / n_gpu_layers are not actually passed below —
# the line relies on llama-cpp-python defaults; confirm they match intent.
print("Loading model (this may take a minute)...", file=sys.stderr)
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
 
 
 
 
 
 
 
43
 
44
def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
    """Download an image/GIF from *url*, save a temp JPEG and caption it locally.

    Returns the generated caption, or a human-readable error string — this
    function never raises; all failures are reported as text so the Gradio
    UI can display them.
    """
    if not url:
        return "No URL provided."
    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"

    try:
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    # Save a temporary JPEG locally so the gguf image token loader can access it
    tmp_dir = "/tmp/joycap"
    os.makedirs(tmp_dir, exist_ok=True)
    # Millisecond timestamp is unique enough for a single-worker Space.
    ts = int(time.time() * 1000)
    tmp_path = os.path.join(tmp_dir, f"{ts}.jpg")
    try:
        img.save(tmp_path, format="JPEG", quality=85)
    except Exception as e:
        return f"Failed to save temp image: {e}"

    prompt_full = make_prompt_for_image(tmp_path, prompt)
    try:
        # BUGFIX: llama_cpp.Llama has no `.create` method — calling it raised
        # AttributeError on every request (surfacing as "Inference error").
        # The text-completion entry point is `create_completion` (also
        # reachable as `llm(...)`).
        resp = llm.create_completion(
            prompt=prompt_full,
            max_tokens=256,
            temperature=0.2,
            top_p=0.95,
            stop=["User:", "Assistant:"],
        )
        text = resp.get("choices", [{}])[0].get("text", "").strip()
        return text or "No caption generated."
    except Exception as e:
        return f"Inference error: {e}"
    finally:
        # Always remove the temp frame; ignore a missing file or races.
        try:
            os.remove(tmp_path)
        except Exception:
            pass
85
 
86
# Gradio UI: one URL textbox plus an optional prompt in, the caption (or an
# error string from generate_caption_from_url) out.
iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg"),
        gr.Textbox(label="Prompt (optional)", value="Describe the image."),
    ],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption - local GGUF (Q4)",
    description="Runs a quantized GGUF model locally via llama.cpp (no external APIs). Ensure the GGUF in start.sh is a multimodal model that supports <img> tags.",
)
96
 
 
 
 
 
 
97
if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port.
    iface.launch(server_name="0.0.0.0", server_port=7860)