Spaces:

DivyanshHF
/

VisionLLM

Runtime error

App Files Files Community

DivyanshHF committed on Aug 10

Commit

c6110b4

verified ·

1 Parent(s): 61a367a

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -33

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import sys
 import types
@@ -6,67 +7,85 @@ from PIL import Image
 import gradio as gr
 # ===============================
-# Make a PACKAGE-like dummy flash_attn
 # ===============================
 def _mk_pkg(name: str):
     # Build an empty in-memory module called `name`, attach a loader-less
     # ModuleSpec created with is_package=True plus an empty __path__, and
     # return the module object. Nothing is registered in sys.modules here.
     m = types.ModuleType(name)
-    # Mark as a package: give it a spec with submodule locations and a __path__
     spec = importlib.machinery.ModuleSpec(name, loader=None, is_package=True)
-    spec.submodule_search_locations = []  # important: tells importlib it's a package
     m.__spec__ = spec
-    m.__path__ = []  # also marks as package
     return m
-# Root package
 flash_attn_pkg = _mk_pkg("flash_attn")
-# Submodule: flash_attn.flash_attn_interface
 flash_attn_interface = types.ModuleType("flash_attn.flash_attn_interface")
 flash_attn_interface.__spec__ = importlib.machinery.ModuleSpec(
     "flash_attn.flash_attn_interface", loader=None
 )
-# Submodule: flash_attn.bert_padding
 flash_attn_bert_padding = types.ModuleType("flash_attn.bert_padding")
 flash_attn_bert_padding.__spec__ = importlib.machinery.ModuleSpec(
     "flash_attn.bert_padding", loader=None
 )
 def _dummy_func(*args, **kwargs):
     # Placeholder callable: accepts any positional/keyword arguments, ignores
     # them, and always raises RuntimeError.
-    # Should never be called on CPU; if it is, let’s fail loudly
     raise RuntimeError("flash_attn is not available in this environment.")
-# Functions some imports expect to exist:
 flash_attn_interface.flash_attn_unpadded_qkvpacked_func = _dummy_func
 flash_attn_interface.flash_attn_varlen_qkvpacked_func = _dummy_func
 flash_attn_bert_padding.pad_input = _dummy_func
 flash_attn_bert_padding.unpad_input = _dummy_func
-# Register modules
 sys.modules["flash_attn"] = flash_attn_pkg
 sys.modules["flash_attn.flash_attn_interface"] = flash_attn_interface
 sys.modules["flash_attn.bert_padding"] = flash_attn_bert_padding
 # ===============================
-# Runtime env (CPU-friendly)
 # ===============================
 os.environ.setdefault("FLASH_ATTENTION", "0")
 os.environ.setdefault("XFORMERS_DISABLED", "1")
 os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
-# Uncomment to force CPU even if a GPU is present:
 # os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
 # ===============================
-# VILA imports & load
 # ===============================
 from llava.model.builder import load_pretrained_model
 from llava.constants import DEFAULT_IMAGE_TOKEN
 MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"
-tokenizer, model, image_processor, context_len = load_pretrained_model(
-    MODEL_PATH, model_name="", model_base=None
-)
 # Fallback chat template if missing
 if getattr(tokenizer, "chat_template", None) is None:
@@ -76,38 +95,46 @@ if getattr(tokenizer, "chat_template", None) is None:
     )
 # ===============================
-# Inference function
 # ===============================
 def vila_infer(image, prompt):
     # Click handler wired to the Run button: receives the image component's
     # value (declared with type="numpy") and the prompt text, returns a string.
     if image is None:
         return "Please upload an image."
     # NOTE(review): calling .strip() directly raises AttributeError when
     # prompt is None; only empty/whitespace strings reach the default below.
-    if not prompt.strip():
         prompt = "Please describe the image."
     pil = Image.fromarray(image).convert("RGB")
     # No try/except around the model call: any exception raised by
     # generate_content propagates out of this handler.
-    out = model.generate_content(
-        prompt=[{
-            "from": "human",
-            "value": [
-                {"type": "image", "value": pil},
-                {"type": "text", "value": prompt}
-            ]
-        }],
-        generation_config=None
-    )
-    return str(out)
 # ===============================
 # Gradio UI
 # ===============================
 with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
-    gr.Markdown("## 🖼️ VILA-1.5-3B Image Description Demo\nUpload an image and get a description.")
     with gr.Row():
         img = gr.Image(type="numpy", label="Image", height=320)
         prompt = gr.Textbox(label="Prompt", value="Please describe the image", lines=2)
-    btn = gr.Button("Run")
-    out = gr.Textbox(label="Output", lines=8)
-    btn.click(vila_infer, [img, prompt], out)
 demo.launch()

+# app.py
 import os
 import sys
 import types
 import gradio as gr
 # ===============================
+# Make dummy packages for flash_attn and ps3 (CPU-friendly import stubs)
 # ===============================
 def _mk_pkg(name: str):
     # Build an empty in-memory module called `name` that is flagged as a
     # package: loader-less ModuleSpec created with is_package=True, plus an
     # empty __path__. Returns the module; it does not touch sys.modules.
     m = types.ModuleType(name)
     spec = importlib.machinery.ModuleSpec(name, loader=None, is_package=True)
     # Empty search-location list: there is no on-disk directory behind this package.
+    spec.submodule_search_locations = []
     m.__spec__ = spec
+    m.__path__ = []
     return m
+# --- flash_attn package + submodules ---
 # The root is created via _mk_pkg; the two submodules are plain ModuleType
 # objects that each get a loader-less ModuleSpec under their dotted name.
 flash_attn_pkg = _mk_pkg("flash_attn")
 flash_attn_interface = types.ModuleType("flash_attn.flash_attn_interface")
 flash_attn_interface.__spec__ = importlib.machinery.ModuleSpec(
     "flash_attn.flash_attn_interface", loader=None
 )
 flash_attn_bert_padding = types.ModuleType("flash_attn.bert_padding")
 flash_attn_bert_padding.__spec__ = importlib.machinery.ModuleSpec(
     "flash_attn.bert_padding", loader=None
 )
 def _dummy_func(*args, **kwargs):
     # Placeholder callable: accepts any positional/keyword arguments, ignores
     # them, and always raises RuntimeError.
+    # Should never be called on CPU; if it is, fail loudly so we notice.
     raise RuntimeError("flash_attn is not available in this environment.")
 # Attach the raising placeholder under each function name on the stub
 # submodules, so attribute lookups succeed but any call raises RuntimeError.
 flash_attn_interface.flash_attn_unpadded_qkvpacked_func = _dummy_func
 flash_attn_interface.flash_attn_varlen_qkvpacked_func = _dummy_func
 flash_attn_bert_padding.pad_input = _dummy_func
 flash_attn_bert_padding.unpad_input = _dummy_func
 # Register all three stubs in sys.modules under their dotted names so later
 # import statements for these names resolve to the stub objects.
 sys.modules["flash_attn"] = flash_attn_pkg
 sys.modules["flash_attn.flash_attn_interface"] = flash_attn_interface
 sys.modules["flash_attn.bert_padding"] = flash_attn_bert_padding
+# --- ps3 package stub ---
 # Four empty placeholder classes (no attributes or methods) are exported on
 # the stub package under the public PS3* names, then the package is
 # registered in sys.modules as "ps3".
 # NOTE(review): presumably these only need to exist for import-time name
 # lookups; instantiating them yields objects with no behaviour — confirm
 # nothing on the CPU path relies on the real classes.
+ps3_pkg = _mk_pkg("ps3")
+class _PS3Config: pass
+class _PS3VisionConfig: pass
+class _PS3ImageProcessor: pass
+class _PS3VisionModel: pass
+ps3_pkg.PS3Config = _PS3Config
+ps3_pkg.PS3VisionConfig = _PS3VisionConfig
+ps3_pkg.PS3ImageProcessor = _PS3ImageProcessor
+ps3_pkg.PS3VisionModel = _PS3VisionModel
+sys.modules["ps3"] = ps3_pkg
 # ===============================
+# Runtime env (CPU-safe defaults)
 # ===============================
 # setdefault only writes a variable that is not already present, so values
 # configured outside the script take precedence over these defaults.
 # NOTE(review): confirm the downstream libraries actually read these names.
 os.environ.setdefault("FLASH_ATTENTION", "0")
 os.environ.setdefault("XFORMERS_DISABLED", "1")
 os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
+# Uncomment to force CPU even if a GPU is present on the Space
 # os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
 # ===============================
+# VILA imports & model load
 # ===============================
 from llava.model.builder import load_pretrained_model
 from llava.constants import DEFAULT_IMAGE_TOKEN
 MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"
 # The model is loaded at import time. On any Exception, a minimal Gradio page
 # showing the failure message is built and launched, then the error is re-raised.
+try:
+    tokenizer, model, image_processor, context_len = load_pretrained_model(
+        MODEL_PATH, model_name="", model_base=None
+    )
+except Exception as e:
+    # Surface a friendly error on the UI instead of crashing the Space
+    ERR = f"Failed to load model '{MODEL_PATH}': {e}"
+    def _boot_error_ui():
+        with gr.Blocks(title="VILA 1.5 3B – Error") as demo:
+            gr.Markdown("### ❌ Model failed to load")
+            gr.Markdown(ERR)
+        demo.launch()
+    _boot_error_ui()
     # NOTE(review): the original exception is re-raised as soon as
     # _boot_error_ui() returns, so nothing below this statement runs after a
     # load failure — confirm launch() blocks long enough for the page to be seen.
+    raise
 # Fallback chat template if missing
 if getattr(tokenizer, "chat_template", None) is None:
     )
 # ===============================
+# Inference
 # ===============================
 def vila_infer(image, prompt):
     # Click handler wired to the Run button: receives the image component's
     # value (declared with type="numpy") and the prompt text. Every return
     # path yields a string: a hint, the model output, or an error message.
     if image is None:
         return "Please upload an image."
     # A None, empty, or whitespace-only prompt falls back to the default text.
+    if not prompt or not str(prompt).strip():
         prompt = "Please describe the image."
     # This conversion sits outside the try below, so an exception raised here
     # is not converted into the "Inference error" string.
     pil = Image.fromarray(image).convert("RGB")
+    # Minimal multimodal conversation: image + text
     # NOTE(review): confirm this prompt structure matches what the installed
     # llava generate_content() expects.
+    try:
+        out = model.generate_content(
+            prompt=[{
+                "from": "human",
+                "value": [
+                    {"type": "image", "value": pil},
+                    {"type": "text", "value": prompt}
+                ]
+            }],
+            generation_config=None  # use model defaults
+        )
+        return str(out).strip()
+    except Exception as e:
+        return f"❌ Inference error: {e}"
 # ===============================
 # Gradio UI
 # ===============================
 with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
     # Layout: two Markdown blocks, one row holding the image input and the
     # prompt textbox, then the Run button and the output textbox.
+    gr.Markdown("## 🖼️ VILA-1.5-3B — Image Description Demo")
+    gr.Markdown("Upload an image and press **Run**. Leave the prompt as default for simple captioning.")
     with gr.Row():
         img = gr.Image(type="numpy", label="Image", height=320)
         prompt = gr.Textbox(label="Prompt", value="Please describe the image", lines=2)
+    run_btn = gr.Button("Run")
+    out = gr.Textbox(label="Output", lines=10)
     # Clicking Run calls vila_infer with the image and prompt values and
     # writes its return value into the output textbox.
+    run_btn.click(vila_infer, [img, prompt], out)
 # Launched unconditionally at module level (no __main__ guard).
 demo.launch()