Spaces:
Runtime error
Runtime error
| import os, io | |
| import gradio as gr | |
| from PIL import Image | |
# Conservative runtime defaults (the intent is to avoid native kernel issues
# on shared GPUs). They are applied before the llava import below, and
# setdefault() leaves any value already present in the environment untouched.
for _env_default in (
    ("FLASH_ATTENTION", "0"),
    ("XFORMERS_DISABLED", "1"),
    ("ACCELERATE_USE_DEVICE_MAP", "0"),
):
    os.environ.setdefault(*_env_default)
| # ---- VILA imports (from the repo installed via requirements.txt) | |
| from llava.model.builder import load_pretrained_model | |
| from llava.constants import DEFAULT_IMAGE_TOKEN | |
# --- One-time model load, performed when this module is imported
MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"

# An empty string is passed as model_name because some builds need a
# non-None value there.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    MODEL_PATH,
    model_name="",
    model_base=None,
)

# Some checkpoints do not ship a chat template; install a plain
# "ROLE: content" fallback only when the tokenizer has none.
if getattr(tokenizer, "chat_template", None) is None:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{ message['role'] | upper }}: {{ message['content'] }}\n"
        "{% endfor %}ASSISTANT:"
    )
def vila_infer(image, prompt, max_new_tokens, temperature):
    """Answer a text prompt about an uploaded image using the global VILA model.

    Args:
        image: The uploaded image as a NumPy array (the UI declares
            ``gr.Image(type="numpy")``), or ``None`` if nothing was uploaded.
        prompt: The user's question. Blank or ``None`` falls back to a
            generic "describe the image" request.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; ``0`` selects greedy decoding.

    Returns:
        The model reply converted to ``str``, or a short hint when no image
        was supplied.
    """
    import copy  # local import: only needed to clone the generation config

    if image is None:
        return "Please upload an image."
    # `prompt or ""` guards against a None prompt before calling .strip().
    if not (prompt or "").strip():
        prompt = "Please describe the image."

    pil = Image.fromarray(image).convert("RGB")

    # Honour the UI sliders. Previously generation_config=None was passed
    # unconditionally, so max_new_tokens and temperature were silently ignored.
    # If the model exposes no default config we fall back to None as before.
    base_config = getattr(model, "default_generation_config", None)
    if base_config is None:
        generation_config = None
    else:
        generation_config = copy.deepcopy(base_config)
        generation_config.max_new_tokens = int(max_new_tokens)
        # A temperature of 0 is not valid for sampling, so treat it as greedy.
        use_sampling = float(temperature) > 0.0
        generation_config.do_sample = use_sampling
        if use_sampling:
            generation_config.temperature = float(temperature)

    # NOTE(review): upstream VILA's generate_content appears to take the
    # prompt as a str or a flat list of media/text parts and to insert the
    # image token itself; a conversation-shaped dict list is presumably
    # rejected. Confirm against the installed VILA version.
    out = model.generate_content(
        [pil, prompt],
        generation_config=generation_config,
    )

    # Some versions return plain text, others a richer object; normalize.
    return str(out)
# Gradio front end. Nesting is reconstructed from the flattened source using
# the conventional layout: image + prompt in one row, the two sliders in a
# second row, then the button and output box directly under the Blocks root.
with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
    gr.Markdown("## 🖼️ VILA-1.5-3B Demo\nUpload an image and ask a question.")

    with gr.Row():
        img = gr.Image(type="numpy", label="Image", height=320)
        prompt = gr.Textbox(
            label="Prompt",
            value="Please describe the image",
            lines=2,
        )

    with gr.Row():
        max_new = gr.Slider(
            minimum=16,
            maximum=256,
            value=96,
            step=1,
            label="Max new tokens",
        )
        temp = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.0,
            step=0.1,
            label="Temperature",
        )

    btn = gr.Button("Run")
    out = gr.Textbox(label="Output", lines=8)

    # The click handler is registered inside the Blocks context; inputs are
    # passed to vila_infer in the listed order.
    btn.click(
        fn=vila_infer,
        inputs=[img, prompt, max_new, temp],
        outputs=out,
    )

demo.launch()