Spaces: Runtime error
Update app.py
app.py CHANGED
Before (this commit's removed lines are prefixed with "-"):

@@ -1,47 +1,30 @@
  import os

- #
  os.environ["HF_HUB_ENABLE_XET"] = "0"
-
- # Use the robust Rust downloader for big files
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
- # Optional but helpful: resume and avoid symlinks on some filesystems
  os.environ["HF_HUB_ENABLE_RESUME"] = "1"
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

-
  import gradio as gr
  import torch
  from transformers import AutoProcessor, LlavaForConditionalGeneration
  from PIL import Image

-
- # Hugging Face model identifier. See the model card for more details:
- # https://huggingface.co/StarCycle/llava-dinov2-internlm2-7b-v1
  MODEL_ID = "xtuner/llava-phi-3-mini-hf"

- #
- # available we will use it and cast the weights to half precision to
- # reduce memory consumption. Otherwise we fall back to CPU.
  if torch.cuda.is_available():
-     DEVICE = torch.device("cuda")
      TORCH_DTYPE = torch.float16
  else:
-     DEVICE = torch.device("cpu")
      TORCH_DTYPE = torch.float32


  def load_model():
-     """
-
-     The model is loaded with ``trust_remote_code=True`` to allow the
-     repository's custom projector and adapter classes to be registered
-     correctly. We specify ``device_map='auto'`` so that the
-     ``accelerate`` library will distribute the model across the
-     available hardware (GPU/CPU) automatically. The ``torch_dtype``
-     argument ensures that the model weights are loaded in half
-     precision on a GPU and in full precision on a CPU.
      """
      model = LlavaForConditionalGeneration.from_pretrained(
          MODEL_ID,

@@ -51,93 +34,99 @@ def load_model():
          low_cpu_mem_usage=True,
      )
      processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
      return model, processor


- # Load
- # we only do it once. If the model fails to load (for example
- # because of missing dependencies) the exception will be raised here.
  MODEL, PROCESSOR = load_model()


  def answer_question(image: Image.Image, question: str) -> str:
-     """Generate an answer for the given question about the uploaded image.
-
-     Parameters
-     ----------
-     image: PIL.Image.Image
-         The user-provided image. Gradio supplies images as PIL
-         objects, which the LLaVA processor accepts directly.
-     question: str
-         The user's question about the image.
-
-     Returns
-     -------
-     str
-         The answer generated by the model. If either the image or
-         question is missing, an explanatory message is returned.
      """
-
      if image is None:
          return "Please upload an image."
      if not question or not question.strip():
          return "Please enter a question about the image."

- [removed lines 86-98 are not rendered in this view]

- #
-
-
-     with torch.
          generated_ids = MODEL.generate(
              **inputs,
              max_new_tokens=256,
              do_sample=False,
          )

-
-     # the entire conversation (e.g., ``USER: ... ASSISTANT: ...``).
-     output = PROCESSOR.batch_decode(
          generated_ids,
          skip_special_tokens=True,
          clean_up_tokenization_spaces=True,
      )[0]

-
-     # ``ASSISTANT:`` delimiter.
-     if "ASSISTANT:" in output:
-         answer = output.split("ASSISTANT:")[-1].strip()
-     else:
-         # Fallback if the delimiter is not present.
-         answer = output.strip()
-
-     return answer


  def build_interface() -> gr.Interface:
-     """Construct the Gradio Interface object for the app."""
      description = (
          "Upload an image and ask a question about it.\n\n"
-         "This demo uses
-         "
-         "
-         "the InternLM2-Chat-7B language model via a lightweight projector and "
-         "LoRA adapters. Note: inference requires a GPU with sufficient "
-         "memory; on a CPU the generation will be extremely slow."
      )
-     iface = gr.Interface(
          fn=answer_question,
          inputs=[
              gr.Image(type="pil", label="Image"),

@@ -148,22 +137,16 @@ def build_interface() -> gr.Interface:
              ),
          ],
          outputs=gr.Textbox(label="Answer"),
-         title="Visual Question Answering
          description=description,
          flagging_mode="never",
      )
-     return iface


  def main() -> None:
-     """Launch the Gradio app."""
      iface = build_interface()
-     # When running on Hugging Face Spaces the app will automatically set
-     # the appropriate host and port. For local development you can
-     # uncomment the ``server_name`` argument to make the app reachable
-     # from other machines on your network.
      iface.launch()


  if __name__ == "__main__":
-     main()
After (this commit's added lines are prefixed with "+"):

@@ -1,47 +1,30 @@
  import os

+ # ---- Hub download settings (apply before any HF imports) ----
  os.environ["HF_HUB_ENABLE_XET"] = "0"
+ os.environ["HF_HUB_DISABLE_XET"] = "1"
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
  os.environ["HF_HUB_ENABLE_RESUME"] = "1"
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

  import gradio as gr
  import torch
  from transformers import AutoProcessor, LlavaForConditionalGeneration
  from PIL import Image

+ # Use the compact HF-format LLaVA model
  MODEL_ID = "xtuner/llava-phi-3-mini-hf"

+ # Device + dtype
  if torch.cuda.is_available():
      TORCH_DTYPE = torch.float16
  else:
      TORCH_DTYPE = torch.float32


  def load_model():
+     """
+     Load the LLaVA model and its processor.
      """
      model = LlavaForConditionalGeneration.from_pretrained(
          MODEL_ID,

@@ -51,93 +34,99 @@ def load_model():
          low_cpu_mem_usage=True,
      )
      processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+     # ---- Robustness: ensure processor carries vision attrs expected by LLaVA ----
+     vcfg = getattr(model.config, "vision_config", None)
+
+     if not hasattr(processor, "patch_size") or processor.patch_size is None:
+         # CLIP-L/336 typically uses patch_size=14; default to 14 if missing
+         processor.patch_size = getattr(vcfg, "patch_size", 14)
+
+     if (
+         not hasattr(processor, "vision_feature_select_strategy")
+         or processor.vision_feature_select_strategy is None
+     ):
+         processor.vision_feature_select_strategy = getattr(
+             model.config, "vision_feature_select_strategy", "default"
+         )
+
+     if (
+         not hasattr(processor, "num_additional_image_tokens")
+         or processor.num_additional_image_tokens is None
+     ):
+         # CLIP ViT uses a single CLS token
+         processor.num_additional_image_tokens = 1
+
      return model, processor


+ # Load once at import
  MODEL, PROCESSOR = load_model()


  def answer_question(image: Image.Image, question: str) -> str:
      """
+     Generate an answer about the uploaded image.
+     """
      if image is None:
          return "Please upload an image."
      if not question or not question.strip():
          return "Please enter a question about the image."

+     try:
+         # ---- Preferred: chat-template path (handles image + text cleanly) ----
+         conversation = [{
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": question.strip()},
+             ],
+         }]
+
+         inputs = PROCESSOR.apply_chat_template(
+             conversation,
+             add_generation_prompt=True,
+             tokenize=True,
+             return_dict=True,
+             return_tensors="pt",
+             images=[image],
+         )
+     except Exception:
+         # ---- Fallback: legacy prompt with <image> placeholder ----
+         prompt = f"USER: <image>\n{question.strip()} ASSISTANT:"
+         inputs = PROCESSOR(
+             images=image,
+             text=prompt,
+             return_tensors="pt",
+         )

+     # Move all tensors to the model's device
+     inputs = {k: (v.to(MODEL.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
+
+     with torch.inference_mode():
          generated_ids = MODEL.generate(
              **inputs,
              max_new_tokens=256,
              do_sample=False,
          )

+     text = PROCESSOR.batch_decode(
          generated_ids,
          skip_special_tokens=True,
          clean_up_tokenization_spaces=True,
      )[0]

+     return text.strip()


  def build_interface() -> gr.Interface:
      description = (
          "Upload an image and ask a question about it.\n\n"
+         "This demo uses **xtuner/llava-phi-3-mini-hf** (LLaVA in HF format) "
+         "to perform visual question answering. Note: a GPU is recommended; "
+         "CPU inference will be slow."
      )
+     return gr.Interface(
          fn=answer_question,
          inputs=[
              gr.Image(type="pil", label="Image"),

@@ -148,22 +137,16 @@ def build_interface() -> gr.Interface:
              ),
          ],
          outputs=gr.Textbox(label="Answer"),
+         title="Visual Question Answering (LLaVA Phi-3 Mini)",
          description=description,
          flagging_mode="never",
      )


  def main() -> None:
      iface = build_interface()
      iface.launch()


  if __name__ == "__main__":
+     main()
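If the Space builds cleanly, the updated handler can also be exercised outside the Gradio UI. The sketch below is only illustrative: it assumes the Space's app.py is importable from the working directory, that a local test image exists (the name example.jpg is hypothetical), and that importing app will trigger the full model download via load_model(). Note also that HF_HUB_ENABLE_HF_TRANSFER=1 only takes effect when the hf_transfer package is installed in the Space (for example via requirements.txt).

from PIL import Image

import app  # loads MODEL and PROCESSOR at import time (downloads weights on first run)

# "example.jpg" is a placeholder for any local RGB image
image = Image.open("example.jpg").convert("RGB")
print(app.answer_question(image, "What objects are visible in this picture?"))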