Spaces: Build error
Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import os
-import re
 import io
+import re
 import torch
 import requests
 from PIL import Image, ImageSequence
@@ -10,7 +10,6 @@ import gradio as gr
 MODEL_NAME = "fancyfeast/llama-joycaption-beta-one-hf-llava"
 HF_TOKEN = os.getenv("HF_TOKEN")  # optional
 
-# Helper: download bytes safely
 def download_bytes(url: str, timeout: int = 30) -> bytes:
     with requests.get(url, stream=True, timeout=timeout) as resp:
         resp.raise_for_status()
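Note: the hunk cuts off inside `download_bytes`. A minimal sketch of how such a helper typically finishes, streaming chunks into a buffer (the 8192-byte chunk size is an illustrative assumption, not taken from this Space):

def download_bytes(url: str, timeout: int = 30) -> bytes:
    # Stream so the payload is not duplicated in memory before joining.
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        buf = io.BytesIO()
        for chunk in resp.iter_content(chunk_size=8192):
            buf.write(chunk)
        return buf.getvalue()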
@@ -49,23 +48,22 @@ def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
 
 # Load processor + model
 token_arg = {"use_auth_token": HF_TOKEN} if HF_TOKEN else {}
-# Some HF model variants require trust_remote_code and num_additional_image_tokens
 processor = AutoProcessor.from_pretrained(
     MODEL_NAME,
     trust_remote_code=True,
-    num_additional_image_tokens=1,
+    num_additional_image_tokens=1,
     **({} if not HF_TOKEN else {"token": HF_TOKEN})
 )
+# Use float32 on CPU; if CPU-only, torch.bfloat16 may not be supported
 llava_model = LlavaForConditionalGeneration.from_pretrained(
     MODEL_NAME,
     device_map="cpu",
-    torch_dtype=torch.
+    torch_dtype=torch.float32,
     trust_remote_code=True,
     **({} if not HF_TOKEN else {"token": HF_TOKEN})
 )
 llava_model.eval()
 
-# Main inference
 def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
     if not url:
         return "No URL provided."
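This hunk carries the actual fix: the removed `torch_dtype=torch.` is an incomplete expression, a syntax error consistent with the "Build error" status above, and the commit completes it as `torch.float32`. If the Space ever ran on GPU as well, the dtype could be chosen per device; a minimal sketch with a hypothetical `pick_dtype` helper (not part of this app):

def pick_dtype() -> torch.dtype:
    # bfloat16 halves memory on GPUs that support it; CPU inference is
    # safest in float32, matching the choice made in this commit.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float32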
@@ -76,7 +74,6 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
 
     lower = url.lower().split("?")[0]
     try:
-        # crude MP4 detection
         if lower.endswith(".mp4") or raw[:16].lower().find(b"ftyp") != -1:
             try:
                 raw = mp4_to_gif(raw)
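The `ftyp` probe works because ISO BMFF containers (MP4, MOV) place an `ftyp` box directly after the leading 4-byte box size, so the tag lands within the first 16 bytes. The same check as a standalone sketch (the function name is illustrative):

def looks_like_mp4(head: bytes) -> bool:
    # "ftyp" normally starts at byte offset 4, after the big-endian size
    # of the first box, so scanning the first 16 bytes is enough.
    return b"ftyp" in head[:16].lower()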
@@ -86,14 +83,14 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
         except Exception as e:
             return f"Image processing error: {e}"
 
-    # Resize to
+    # Resize to a conservative size (512) expected by many VLMs
     try:
         img = img.resize((512, 512), resample=Image.BICUBIC)
     except Exception:
         pass
 
     try:
-        #
+        # Use chat-like conversation so processor inserts image token correctly
         conversation = [
             {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
         ]
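The call that turns `conversation` into tensors is truncated in the next hunk (only its trailing `return_dict=True, images=img` arguments are visible). For reference, one common pattern for LLaVA-style checkpoints is the two-step template-then-process flow below; `max_new_tokens` and the `no_grad` wrapper are assumptions, not taken from this diff:

prompt_text = processor.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)
inputs = processor(text=[prompt_text], images=[img], return_tensors="pt")
with torch.no_grad():
    output_ids = llava_model.generate(**inputs, max_new_tokens=256)
caption = processor.batch_decode(output_ids, skip_special_tokens=True)[0]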
@@ -104,13 +101,12 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
             return_dict=True,
             images=img,
         )
-        # Move to model device and match dtype for pixel values
         device = llava_model.device
         inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
         if "pixel_values" in inputs:
             inputs["pixel_values"] = inputs["pixel_values"].to(dtype=llava_model.dtype, device=device)
 
-        # Debug
+        # Debug prints (will appear in Space logs)
        if "pixel_values" in inputs:
            print("pixel_values.shape:", inputs["pixel_values"].shape)
        if "input_ids" in inputs:
@@ -123,7 +119,6 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
     except Exception as e:
         return f"Inference error: {e}"
 
-# Gradio UI
 gradio_kwargs = dict(
     fn=generate_caption_from_url,
     inputs=[
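`gradio_kwargs` is truncated here; given the `iface.launch(...)` call in the final hunk, the dict is presumably unpacked into the constructor:

iface = gr.Interface(**gradio_kwargs)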
@@ -144,7 +139,6 @@ if __name__ == "__main__":
     try:
         iface.launch(server_name="0.0.0.0", server_port=7860)
     finally:
-        # close event loop safely in Spaces environment
         try:
             import asyncio
             loop = asyncio.get_event_loop()
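The `finally` block is also truncated. Per the removed comment, the intent is to close the asyncio loop cleanly when the Space shuts down; a minimal sketch of defensive cleanup along those lines (the actual body past `get_event_loop()` is not shown):

try:
    loop = asyncio.get_event_loop()
    if not loop.is_closed():
        loop.close()
except RuntimeError:
    # get_event_loop() can raise when no loop exists in this thread
    pass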