Hug0endob committed on
Commit
35d219a
·
verified ·
1 Parent(s): 7aed240

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -4
app.py CHANGED
@@ -54,7 +54,7 @@ processor = AutoProcessor.from_pretrained(
54
  num_additional_image_tokens=1,
55
  **({} if not HF_TOKEN else {"token": HF_TOKEN})
56
  )
57
- # Use float32 on CPU; if CPU-only, torch.bfloat16 may not be supported
58
  llava_model = LlavaForConditionalGeneration.from_pretrained(
59
  MODEL_NAME,
60
  device_map="cpu",
@@ -83,14 +83,13 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") ->
83
  except Exception as e:
84
  return f"Image processing error: {e}"
85
 
86
- # Resize to a conservative size (512) expected by many VLMs
87
  try:
88
  img = img.resize((512, 512), resample=Image.BICUBIC)
89
  except Exception:
90
  pass
91
 
92
  try:
93
- # Use chat-like conversation so processor inserts image token correctly
94
  conversation = [
95
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
96
  ]
@@ -106,7 +105,7 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image.") ->
106
  if "pixel_values" in inputs:
107
  inputs["pixel_values"] = inputs["pixel_values"].to(dtype=llava_model.dtype, device=device)
108
 
109
- # Debug prints (will appear in Space logs)
110
  if "pixel_values" in inputs:
111
  print("pixel_values.shape:", inputs["pixel_values"].shape)
112
  if "input_ids" in inputs:
 
54
  num_additional_image_tokens=1,
55
  **({} if not HF_TOKEN else {"token": HF_TOKEN})
56
  )
57
+ # CPU Space -> use float32
58
  llava_model = LlavaForConditionalGeneration.from_pretrained(
59
  MODEL_NAME,
60
  device_map="cpu",
 
83
  except Exception as e:
84
  return f"Image processing error: {e}"
85
 
86
+ # Resize to conservative default
87
  try:
88
  img = img.resize((512, 512), resample=Image.BICUBIC)
89
  except Exception:
90
  pass
91
 
92
  try:
 
93
  conversation = [
94
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
95
  ]
 
105
  if "pixel_values" in inputs:
106
  inputs["pixel_values"] = inputs["pixel_values"].to(dtype=llava_model.dtype, device=device)
107
 
108
+ # Minimal debug info (appears in Space logs)
109
  if "pixel_values" in inputs:
110
  print("pixel_values.shape:", inputs["pixel_values"].shape)
111
  if "input_ids" in inputs: