DivyanshHF committed on
Commit 24c4395 · verified · 1 Parent(s): 62ed03f

Create app.py

Files changed (1):
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
+ import os
+ import gradio as gr
+ from PIL import Image
+ from transformers import GenerationConfig  # used below to wire the UI sliders into decoding
+
+ # Make the runtime conservative (avoid native-kernel issues on shared GPUs)
+ os.environ.setdefault("FLASH_ATTENTION", "0")
+ os.environ.setdefault("XFORMERS_DISABLED", "1")
+ os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
+
+ # ---- VILA imports (from the repo installed via requirements.txt)
+ from llava.model.builder import load_pretrained_model
+
+ # ---- Load VILA-1.5-3B once at startup
+ MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"
+
+ # Some builds need a non-None model_name; an empty string is fine
+ tokenizer, model, image_processor, context_len = load_pretrained_model(
+     MODEL_PATH, model_name="", model_base=None
+ )
+
+ # Fallback chat template (some checkpoints don't ship one)
+ if getattr(tokenizer, "chat_template", None) is None:
+     tokenizer.chat_template = (
+         "{% for message in messages %}{{ message['role'] | upper }}: "
+         "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
+     )
+
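+ # (Sketch, not part of the original commit: a quick check of what the fallback
+ # template above renders; apply_chat_template is the standard transformers method.)
+ # Example: tokenizer.apply_chat_template(
+ #     [{"role": "user", "content": "hi"}], tokenize=False
+ # ) -> 'USER: hi\nASSISTANT:'
+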
+ def vila_infer(image, prompt, max_new_tokens, temperature):
+     if image is None:
+         return "Please upload an image."
+     if not prompt.strip():
+         prompt = "Please describe the image."
+
+     # VILA expects a "conversation" with mixed media. We pass both the image
+     # and the text; the model code finds the image and inserts the media
+     # tokens automatically (under the hood it looks for DEFAULT_IMAGE_TOKEN
+     # or a media dict).
+     pil = Image.fromarray(image).convert("RGB")
+
+     # Wire the UI sliders into decoding: greedy at temperature 0, sampling otherwise
+     gen_cfg = GenerationConfig(
+         max_new_tokens=int(max_new_tokens),
+         do_sample=float(temperature) > 0,
+         temperature=float(temperature) if float(temperature) > 0 else 1.0,
+     )
+
+     # Let VILA handle preprocessing & generation
+     out = model.generate_content(
+         prompt=[{"from": "human", "value": [{"type": "image", "value": pil},
+                                             {"type": "text", "value": prompt}]}],
+         generation_config=gen_cfg,
+     )
+     # Some versions return plain text; others return dicts. Normalize:
+     return str(out)
+
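+ # (Sketch, not part of the original commit: an optional local smoke test of
+ # vila_infer with a blank image; numpy ships as a gradio dependency.)
+ # import numpy as np
+ # print(vila_infer(np.zeros((224, 224, 3), dtype=np.uint8),
+ #                  "Describe this image.", 32, 0.0))
+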
+ with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
+     gr.Markdown("## 🖼️ VILA-1.5-3B Demo\nUpload an image and ask a question.")
+     with gr.Row():
+         img = gr.Image(type="numpy", label="Image", height=320)
+         prompt = gr.Textbox(label="Prompt", value="Please describe the image", lines=2)
+     with gr.Row():
+         max_new = gr.Slider(16, 256, value=96, step=1, label="Max new tokens")
+         temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
+     btn = gr.Button("Run")
+     out = gr.Textbox(label="Output", lines=8)
+     btn.click(vila_infer, [img, prompt, max_new, temp], out)
+
+ demo.launch()
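
A note on dependencies: the import comment in app.py assumes the VILA repo is installed through the Space's requirements.txt. A minimal sketch of such a file, assuming the NVlabs/VILA repo is pip-installable from git (the exact contents are not part of this commit):

    git+https://github.com/NVlabs/VILA.git
    gradio
    pillow
    transformers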