DivyanshHF committed
Commit 279f604 (verified) · 1 parent: 3f3111c

Update app.py

Files changed (1)
  1. app.py +33 -24
app.py CHANGED
@@ -1,65 +1,74 @@
-import os, io
+import os
+
+# ===== Disable GPU-specific optional deps for Hugging Face Spaces =====
+os.environ["FLASH_ATTENTION"] = "0"
+os.environ["DISABLE_FLASH_ATTN"] = "1"
+os.environ["XFORMERS_DISABLED"] = "1"
+os.environ["ACCELERATE_USE_DEVICE_MAP"] = "0"
+
+# Optional: force CPU if GPU not available
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
 import gradio as gr
 from PIL import Image
 
-# Make runtime conservative (avoid native kernel issues on shared GPUs)
-os.environ.setdefault("FLASH_ATTENTION", "0")
-os.environ.setdefault("XFORMERS_DISABLED", "1")
-os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
-
-# ---- VILA imports (from the repo installed via requirements.txt)
+# ---- VILA imports ----
 from llava.model.builder import load_pretrained_model
 from llava.constants import DEFAULT_IMAGE_TOKEN
 
-# --- Load VILA-1.5-3B once
+# === Load VILA 1.5-3B ===
 MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"
-
-# Some builds need a non-None model_name; empty string is fine
 tokenizer, model, image_processor, context_len = load_pretrained_model(
     MODEL_PATH, model_name="", model_base=None
 )
 
-# Fallback chat template (some checkpoints don’t ship one)
+# === Fallback chat template (in case checkpoint doesn't have one) ===
 if getattr(tokenizer, "chat_template", None) is None:
     tokenizer.chat_template = (
         "{% for message in messages %}{{ message['role'] | upper }}: "
         "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
     )
 
+# === Inference function ===
 def vila_infer(image, prompt, max_new_tokens, temperature):
     if image is None:
-        return "Please upload an image."
+        return "Please upload an image."
     if not prompt.strip():
         prompt = "Please describe the image."
 
-    # VILA expects a “conversation” with mixed media.
-    # We pass both the image and the text. The model code will find the image
-    # and insert media tokens automatically.
-    # (Under the hood it looks for DEFAULT_IMAGE_TOKEN or a media dict.)
     pil = Image.fromarray(image).convert("RGB")
 
-    # Minimal prompt: put the <image> token then your question
-    user_prompt = f"{DEFAULT_IMAGE_TOKEN}\n{prompt}"
+    # Prepare multimodal input for VILA
+    conversation = [{
+        "from": "human",
+        "value": [
+            {"type": "image", "value": pil},
+            {"type": "text", "value": prompt}
+        ]
+    }]
 
-    # Let VILA handle preprocessing & generation
+    # Generate output
     out = model.generate_content(
-        prompt=[{"from":"human","value":[{"type":"image","value":pil},
-                 {"type":"text","value":prompt}]}],
+        prompt=conversation,
         generation_config=None
     )
-    # Some versions return plain text; others return dicts. Normalize:
     return str(out)
 
-with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
-    gr.Markdown("## 🖼️ VILA-1.5-3B Demo\nUpload an image and ask a question.")
+# === Gradio UI ===
+with gr.Blocks(title="VILA 1.5 3B Demo") as demo:
+    gr.Markdown("## 🖼️ VILA-1.5-3B — Image Understanding Demo\nUpload an image and ask a question.")
+
     with gr.Row():
         img = gr.Image(type="numpy", label="Image", height=320)
         prompt = gr.Textbox(label="Prompt", value="Please describe the image", lines=2)
+
     with gr.Row():
         max_new = gr.Slider(16, 256, value=96, step=1, label="Max new tokens")
         temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
+
     btn = gr.Button("Run")
     out = gr.Textbox(label="Output", lines=8)
+
     btn.click(vila_infer, [img, prompt, max_new, temp], out)
 
 demo.launch()
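
The fallback chat template can be sanity-checked without running the model. A quick sketch, assuming the tokenizer returned by load_pretrained_model is a standard transformers tokenizer that exposes apply_chat_template:

# Render the fallback template for a sample conversation (no tokenization).
messages = [{"role": "user", "content": "What is in this picture?"}]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# Expected output with the fallback template:
# USER: What is in this picture?
# ASSISTANT: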
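
One thing to note about the generation parameters: vila_infer receives max_new_tokens and temperature from the sliders but still passes generation_config=None, so both values are ignored during generation. A minimal sketch of wiring them through, assuming generate_content accepts a transformers GenerationConfig (the exact signature depends on the installed VILA revision):

from transformers import GenerationConfig

def build_generation_config(max_new_tokens, temperature):
    # Sample when temperature > 0; otherwise fall back to greedy decoding.
    if temperature > 0:
        return GenerationConfig(
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=float(temperature),
        )
    return GenerationConfig(max_new_tokens=int(max_new_tokens), do_sample=False)

# Hypothetical usage inside vila_infer, replacing generation_config=None:
# out = model.generate_content(
#     prompt=conversation,
#     generation_config=build_generation_config(max_new_tokens, temperature),
# )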