Spaces:

DivyanshHF
/

VisionLLM

Runtime error

App Files Files Community

VisionLLM / app.py

DivyanshHF

Create app.py

24c4395 verified 4 months ago

raw

history blame

2.61 kB

	import os, io
	import gradio as gr
	from PIL import Image

	# Make runtime conservative (avoid native kernel issues on shared GPUs).
	# setdefault only fills in a variable that is not already present, so a value
	# supplied by the hosting environment always wins.
	# NOTE(review): confirm the installed libraries actually read these variable
	# names; nothing in this file consumes them directly.
	for _env_name, _env_value in (
	    ("FLASH_ATTENTION", "0"),
	    ("XFORMERS_DISABLED", "1"),
	    ("ACCELERATE_USE_DEVICE_MAP", "0"),
	):
	    os.environ.setdefault(_env_name, _env_value)

	# ---- VILA imports (from the repo installed via requirements.txt)
	from llava.model.builder import load_pretrained_model
	from llava.constants import DEFAULT_IMAGE_TOKEN

	# --- Load VILA-1.5-3B once, at module import, so every request reuses it.
	MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"

	# Some builds need a non-None model_name; empty string is fine
	tokenizer, model, image_processor, context_len = load_pretrained_model(
	    MODEL_PATH, model_name="", model_base=None
	)

	# Fallback chat template (some checkpoints don't ship one).
	# It renders every message as "ROLE: content" on its own line and ends with
	# an "ASSISTANT:" cue. The Jinja filter separator has to be a bare "|":
	# writing it as "\|" is an invalid Python escape that leaves a literal
	# backslash in the template, which Jinja cannot parse.
	if getattr(tokenizer, "chat_template", None) is None:
	    tokenizer.chat_template = (
	        "{% for message in messages %}{{ message['role'] | upper }}: "
	        "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
	    )

	def _build_generation_config(max_new_tokens, temperature):
	    """Return a generation config carrying the UI settings, or None.

	    The model's own config is copied and then overridden, so the shared
	    instance on the model is never mutated between requests.

	    NOTE(review): assumes the model exposes a Hugging Face style config
	    (attributes ``max_new_tokens``, ``do_sample``, ``temperature``) through
	    ``default_generation_config`` or ``generation_config`` -- confirm against
	    the installed VILA build.
	    """
	    import copy  # local import keeps this block self-contained

	    base = getattr(model, "default_generation_config", None)
	    if base is None:
	        base = getattr(model, "generation_config", None)
	    if base is None:
	        # Nothing to copy from; let the model fall back to its own defaults.
	        return None

	    config = copy.deepcopy(base)
	    if max_new_tokens is not None:
	        # Sliders may deliver floats such as 96.0; token counts must be ints.
	        config.max_new_tokens = int(max_new_tokens)
	    if temperature is not None:
	        sampling = float(temperature) > 0.0
	        # Temperature 0 means greedy decoding rather than sampling.
	        config.do_sample = sampling
	        if sampling:
	            config.temperature = float(temperature)
	    return config


	def vila_infer(image, prompt, max_new_tokens, temperature):
	    """Answer a text prompt about an uploaded image with the loaded model.

	    Args:
	        image: Array accepted by ``PIL.Image.fromarray`` (the UI supplies a
	            numpy image), or None when nothing was uploaded.
	        prompt: Question or instruction; blank or None falls back to a
	            generic "describe the image" request.
	        max_new_tokens: Upper bound on generated tokens.
	        temperature: Sampling temperature; 0 selects greedy decoding.

	    Returns:
	        The model output converted to ``str``, or a short instruction string
	        when no image was provided.
	    """
	    if image is None:
	        return "Please upload an image."
	    # Guard against None as well as whitespace-only prompts.
	    if not (prompt or "").strip():
	        prompt = "Please describe the image."

	    pil = Image.fromarray(image).convert("RGB")

	    # Pass the image and the text together as one human turn.
	    # NOTE(review): upstream VILA examples appear to pass a flat list such as
	    # [image, text] to generate_content -- confirm that this nested
	    # conversation form is accepted by the installed build.
	    out = model.generate_content(
	        prompt=[
	            {
	                "from": "human",
	                "value": [
	                    {"type": "image", "value": pil},
	                    {"type": "text", "value": prompt},
	                ],
	            }
	        ],
	        generation_config=_build_generation_config(max_new_tokens, temperature),
	    )
	    # Some versions return plain text; others return dicts. Normalize:
	    return str(out)

	# Assemble the demo page: an image/prompt row, a row of generation sliders,
	# a Run button, and the text box that receives the answer.
	with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
	    gr.Markdown("## 🖼️ VILA-1.5-3B Demo\nUpload an image and ask a question.")

	    with gr.Row():
	        image_input = gr.Image(type="numpy", label="Image", height=320)
	        prompt_box = gr.Textbox(
	            label="Prompt",
	            value="Please describe the image",
	            lines=2,
	        )

	    with gr.Row():
	        max_tokens_slider = gr.Slider(
	            16, 256, value=96, step=1, label="Max new tokens"
	        )
	        temperature_slider = gr.Slider(
	            0.0, 1.0, value=0.0, step=0.1, label="Temperature"
	        )

	    run_button = gr.Button("Run")
	    answer_box = gr.Textbox(label="Output", lines=8)

	    # Input order mirrors vila_infer's positional parameters.
	    run_button.click(
	        fn=vila_infer,
	        inputs=[image_input, prompt_box, max_tokens_slider, temperature_slider],
	        outputs=answer_box,
	    )

	demo.launch()