DivyanshHF committed (verified)
Commit a709033 · 1 Parent(s): d3ff5e8

Update app.py

Files changed (1): app.py (+40 -30)
app.py CHANGED
@@ -1,62 +1,72 @@
 import os
-
-# ===== Disable GPU-specific optional deps for Hugging Face Spaces =====
-os.environ["FLASH_ATTENTION"] = "0"
-os.environ["DISABLE_FLASH_ATTN"] = "1"
-os.environ["XFORMERS_DISABLED"] = "1"
-os.environ["ACCELERATE_USE_DEVICE_MAP"] = "0"
-
-# Optional: force CPU if GPU not available
-# os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
+import sys
+import types
 import gradio as gr
 from PIL import Image
 
-# ---- VILA imports ----
+# ======================
+# Disable FlashAttention
+# ======================
+sys.modules["flash_attn"] = types.ModuleType("flash_attn")
+sys.modules["flash_attn.flash_attn_interface"] = types.ModuleType("flash_attn.flash_attn_interface")
+
+def _dummy_func(*args, **kwargs):
+    raise RuntimeError("FlashAttention is not available in this environment.")
+
+sys.modules["flash_attn.flash_attn_interface"].flash_attn_unpadded_qkvpacked_func = _dummy_func
+sys.modules["flash_attn.flash_attn_interface"].flash_attn_varlen_qkvpacked_func = _dummy_func
+
+# ======================
+# CPU-only settings
+# ======================
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
+os.environ.setdefault("FLASH_ATTENTION", "0")
+os.environ.setdefault("XFORMERS_DISABLED", "1")
+os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
+
+# ======================
+# VILA imports
+# ======================
 from llava.model.builder import load_pretrained_model
 from llava.constants import DEFAULT_IMAGE_TOKEN
 
-# === Load VILA 1.5-3B ===
 MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"
+
 tokenizer, model, image_processor, context_len = load_pretrained_model(
     MODEL_PATH, model_name="", model_base=None
 )
 
-# === Fallback chat template (in case checkpoint doesn't have one) ===
+# Add fallback chat template if missing
 if getattr(tokenizer, "chat_template", None) is None:
     tokenizer.chat_template = (
         "{% for message in messages %}{{ message['role'] | upper }}: "
         "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
     )
 
-# === Inference function ===
 def vila_infer(image, prompt, max_new_tokens, temperature):
     if image is None:
-        return "Please upload an image."
+        return "Please upload an image."
     if not prompt.strip():
         prompt = "Please describe the image."
 
     pil = Image.fromarray(image).convert("RGB")
 
-    # Prepare multimodal input for VILA
-    conversation = [{
-        "from": "human",
-        "value": [
-            {"type": "image", "value": pil},
-            {"type": "text", "value": prompt}
-        ]
-    }]
-
-    # Generate output
+    # Minimal conversation: image + prompt
     out = model.generate_content(
-        prompt=conversation,
-        generation_config=None
+        prompt=[{
+            "from": "human",
+            "value": [
+                {"type": "image", "value": pil},
+                {"type": "text", "value": prompt}
+            ]
+        }],
+        generation_config={"max_new_tokens": max_new_tokens, "temperature": temperature}
     )
+
     return str(out)
 
-# === Gradio UI ===
-with gr.Blocks(title="VILA 1.5 3B Demo") as demo:
-    gr.Markdown("## 🖼️ VILA-1.5-3B — Image Understanding Demo\nUpload an image and ask a question.")
+with gr.Blocks(title="VILA 1.5 3B (CPU, HF Space)") as demo:
+    gr.Markdown("## 🖼️ VILA-1.5-3B — Image Captioning\nUpload an image and get a description.")
 
     with gr.Row():
         img = gr.Image(type="numpy", label="Image", height=320)
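The `sys.modules` stubbing introduced in this commit works because Python consults the `sys.modules` cache before searching for a real package, so a later `import flash_attn` inside llava resolves to the stub instead of raising ImportError; only code paths that actually call the stubbed kernels fail, with a clear message. A minimal self-contained sketch of the same trick (the package name `gpu_only_lib` is made up for illustration):

import sys
import types

# Register a stand-in module before anything tries to import the absent package.
stub = types.ModuleType("gpu_only_lib")

def _unavailable(*args, **kwargs):
    raise RuntimeError("gpu_only_lib is not installed in this environment.")

stub.fast_kernel = _unavailable
sys.modules["gpu_only_lib"] = stub

import gpu_only_lib  # resolves to the stub; no ImportError

try:
    gpu_only_lib.fast_kernel()
except RuntimeError as err:
    print(err)  # gpu_only_lib is not installed in this environment.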
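The fallback chat template is plain Jinja, so its output can be previewed with any `transformers` tokenizer; in the sketch below the `gpt2` checkpoint is only a convenient stand-in for rendering, not the model this Space uses:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer, for rendering only
tok.chat_template = (
    "{% for message in messages %}{{ message['role'] | upper }}: "
    "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
)

messages = [{"role": "user", "content": "Describe this image."}]
print(tok.apply_chat_template(messages, tokenize=False))
# USER: Describe this image.
# ASSISTANT: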
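The diff is truncated inside the UI block. For orientation only, a typical way the remaining widgets and event wiring of such a Blocks app look in Gradio is sketched below; the widget names, slider ranges, and defaults are assumptions, not the commit's actual code:

import gradio as gr

def vila_infer(image, prompt, max_new_tokens, temperature):
    return "stub"  # stands in for the real inference function above

with gr.Blocks(title="VILA 1.5 3B (CPU, HF Space)") as demo:
    with gr.Row():
        img = gr.Image(type="numpy", label="Image", height=320)
        with gr.Column():
            prompt = gr.Textbox(label="Prompt", value="Please describe the image.")
            max_tokens = gr.Slider(16, 512, value=128, step=8, label="Max new tokens")
            temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
            run = gr.Button("Run")
    output = gr.Textbox(label="Output")
    run.click(fn=vila_infer, inputs=[img, prompt, max_tokens, temperature], outputs=output)

demo.launch()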