import importlib.machinery
import os
import sys
import types

from PIL import Image
import gradio as gr

# ===============================
# Make a PACKAGE-like dummy flash_attn
# ===============================
# llava/VILA code paths import flash_attn (and its submodules) at import time.
# flash_attn is CUDA-only and cannot be installed on a CPU-only Space, so we
# pre-register stub modules in sys.modules BEFORE llava is imported below.


def _mk_pkg(name: str) -> types.ModuleType:
    """Create an empty module object that the import system treats as a package.

    Args:
        name: fully qualified module name to stub (e.g. "flash_attn").

    Returns:
        A ``types.ModuleType`` whose spec and ``__path__`` mark it as a package.
    """
    mod = types.ModuleType(name)
    # is_package=True already initializes spec.submodule_search_locations to [],
    # so no manual assignment is needed; __path__ additionally marks the object
    # as a package for code that checks the attribute directly.
    spec = importlib.machinery.ModuleSpec(name, loader=None, is_package=True)
    mod.__spec__ = spec
    mod.__path__ = []
    return mod


def _dummy_func(*args, **kwargs):
    """Stand-in for flash_attn kernels.

    Should never be called on CPU; if it is, fail loudly instead of silently
    producing garbage.
    """
    raise RuntimeError("flash_attn is not available in this environment.")


# Root package
flash_attn_pkg = _mk_pkg("flash_attn")

# Submodule: flash_attn.flash_attn_interface
flash_attn_interface = types.ModuleType("flash_attn.flash_attn_interface")
flash_attn_interface.__spec__ = importlib.machinery.ModuleSpec(
    "flash_attn.flash_attn_interface", loader=None
)

# Submodule: flash_attn.bert_padding
flash_attn_bert_padding = types.ModuleType("flash_attn.bert_padding")
flash_attn_bert_padding.__spec__ = importlib.machinery.ModuleSpec(
    "flash_attn.bert_padding", loader=None
)

# Functions some imports expect to exist:
flash_attn_interface.flash_attn_unpadded_qkvpacked_func = _dummy_func
flash_attn_interface.flash_attn_varlen_qkvpacked_func = _dummy_func
flash_attn_bert_padding.pad_input = _dummy_func
flash_attn_bert_padding.unpad_input = _dummy_func

# FIX: a real `import flash_attn.X` also binds X as an attribute of the parent
# package; mirror that so `import flash_attn; flash_attn.flash_attn_interface`
# works instead of raising AttributeError.
flash_attn_pkg.flash_attn_interface = flash_attn_interface
flash_attn_pkg.bert_padding = flash_attn_bert_padding

# Register modules
sys.modules["flash_attn"] = flash_attn_pkg
sys.modules["flash_attn.flash_attn_interface"] = flash_attn_interface
sys.modules["flash_attn.bert_padding"] = flash_attn_bert_padding

# ===============================
# Runtime env (CPU-friendly)
# ===============================
# setdefault: honor any values already set in the Space's environment.
os.environ.setdefault("FLASH_ATTENTION", "0")
os.environ.setdefault("XFORMERS_DISABLED", "1")
os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
# Uncomment to force CPU even if a GPU is present:
# os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")

# ===============================
# VILA imports & load
# ===============================
# NOTE: these imports must come AFTER the flash_attn stubs are registered.
from llava.model.builder import load_pretrained_model
from llava.constants import DEFAULT_IMAGE_TOKEN

MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"

tokenizer, model, image_processor, context_len = load_pretrained_model(
    MODEL_PATH, model_name="", model_base=None
)

# Fallback chat template if missing (some checkpoints ship a tokenizer
# without one; generate_content needs it to format the conversation).
if getattr(tokenizer, "chat_template", None) is None:
    tokenizer.chat_template = (
        "{% for message in messages %}{{ message['role'] | upper }}: "
        "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
    )


# ===============================
# Inference function
# ===============================
def vila_infer(image, prompt):
    """Describe *image* according to *prompt* using the loaded VILA model.

    Args:
        image: numpy HxWxC array from the Gradio Image component, or None.
        prompt: user text from the Gradio Textbox; may be None or blank.

    Returns:
        The model's generated text (stringified), or a short user-facing hint
        when no image was supplied.
    """
    if image is None:
        return "Please upload an image."
    # FIX: Gradio can deliver None for a cleared textbox; guard before .strip()
    # so we fall back to the default prompt instead of raising AttributeError.
    if not prompt or not prompt.strip():
        prompt = "Please describe the image."
    pil = Image.fromarray(image).convert("RGB")
    # generate_content takes a chat-style message list mixing image and text
    # parts; generation_config=None uses the model's defaults.
    out = model.generate_content(
        prompt=[{
            "from": "human",
            "value": [
                {"type": "image", "value": pil},
                {"type": "text", "value": prompt}
            ]
        }],
        generation_config=None
    )
    return str(out)


# ===============================
# Gradio UI
# ===============================
with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
    gr.Markdown("## 🖼️ VILA-1.5-3B Image Description Demo\nUpload an image and get a description.")
    with gr.Row():
        img = gr.Image(type="numpy", label="Image", height=320)
        prompt = gr.Textbox(label="Prompt", value="Please describe the image", lines=2)
    btn = gr.Button("Run")
    out = gr.Textbox(label="Output", lines=8)
    btn.click(vila_infer, [img, prompt], out)

demo.launch()