DivyanshHF committed
Commit 279f604 (verified) · 1 parent: 3f3111c

Update app.py

Files changed (1)
  1. app.py +33 -24
app.py CHANGED
@@ -1,65 +1,74 @@
-import os, io
+import os
+
+# ===== Disable GPU-specific optional deps for Hugging Face Spaces =====
+os.environ["FLASH_ATTENTION"] = "0"
+os.environ["DISABLE_FLASH_ATTN"] = "1"
+os.environ["XFORMERS_DISABLED"] = "1"
+os.environ["ACCELERATE_USE_DEVICE_MAP"] = "0"
+
+# Optional: force CPU if GPU not available
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
 import gradio as gr
 from PIL import Image
 
-# Make runtime conservative (avoid native kernel issues on shared GPUs)
-os.environ.setdefault("FLASH_ATTENTION", "0")
-os.environ.setdefault("XFORMERS_DISABLED", "1")
-os.environ.setdefault("ACCELERATE_USE_DEVICE_MAP", "0")
-
-# ---- VILA imports (from the repo installed via requirements.txt)
+# ---- VILA imports ----
 from llava.model.builder import load_pretrained_model
 from llava.constants import DEFAULT_IMAGE_TOKEN
 
-# --- Load VILA-1.5-3B once
+# === Load VILA 1.5-3B ===
 MODEL_PATH = "Efficient-Large-Model/VILA1.5-3b"
-
-# Some builds need a non-None model_name; empty string is fine
 tokenizer, model, image_processor, context_len = load_pretrained_model(
     MODEL_PATH, model_name="", model_base=None
 )
 
-# Fallback chat template (some checkpoints don’t ship one)
+# === Fallback chat template (in case checkpoint doesn't have one) ===
 if getattr(tokenizer, "chat_template", None) is None:
     tokenizer.chat_template = (
         "{% for message in messages %}{{ message['role'] | upper }}: "
         "{{ message['content'] }}\n{% endfor %}ASSISTANT:"
     )
 
+# === Inference function ===
 def vila_infer(image, prompt, max_new_tokens, temperature):
     if image is None:
-        return "Please upload an image."
+        return "Please upload an image."
     if not prompt.strip():
         prompt = "Please describe the image."
 
-    # VILA expects a “conversation” with mixed media.
-    # We pass both the image and the text. The model code will find the image
-    # and insert media tokens automatically.
-    # (Under the hood it looks for DEFAULT_IMAGE_TOKEN or a media dict.)
     pil = Image.fromarray(image).convert("RGB")
 
-    # Minimal prompt: put the <image> token then your question
-    user_prompt = f"{DEFAULT_IMAGE_TOKEN}\n{prompt}"
+    # Prepare multimodal input for VILA
+    conversation = [{
+        "from": "human",
+        "value": [
+            {"type": "image", "value": pil},
+            {"type": "text", "value": prompt}
+        ]
+    }]
 
-    # Let VILA handle preprocessing & generation
+    # Generate output
     out = model.generate_content(
-        prompt=[{"from":"human","value":[{"type":"image","value":pil},
-                 {"type":"text","value":prompt}]}],
+        prompt=conversation,
         generation_config=None
     )
-    # Some versions return plain text; others return dicts. Normalize:
     return str(out)
 
-with gr.Blocks(title="VILA 1.5 3B (HF Space)") as demo:
-    gr.Markdown("## 🖼️ VILA-1.5-3B Demo\nUpload an image and ask a question.")
+# === Gradio UI ===
+with gr.Blocks(title="VILA 1.5 3B Demo") as demo:
+    gr.Markdown("## 🖼️ VILA-1.5-3B — Image Understanding Demo\nUpload an image and ask a question.")
+
     with gr.Row():
         img = gr.Image(type="numpy", label="Image", height=320)
         prompt = gr.Textbox(label="Prompt", value="Please describe the image", lines=2)
+
     with gr.Row():
         max_new = gr.Slider(16, 256, value=96, step=1, label="Max new tokens")
         temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
+
     btn = gr.Button("Run")
     out = gr.Textbox(label="Output", lines=8)
+
     btn.click(vila_infer, [img, prompt, max_new, temp], out)
 
 demo.launch()
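
The fallback chat template can be sanity-checked without running the model. A quick sketch, assuming the tokenizer returned by load_pretrained_model is a standard transformers tokenizer that exposes apply_chat_template:

# Render the fallback template for a sample conversation (no tokenization).
messages = [{"role": "user", "content": "What is in this picture?"}]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# Expected output with the fallback template:
# USER: What is in this picture?
# ASSISTANT: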
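
One thing to note about the generation parameters: vila_infer receives max_new_tokens and temperature from the sliders but still passes generation_config=None, so both values are ignored during generation. A minimal sketch of wiring them through, assuming generate_content accepts a transformers GenerationConfig (the exact signature depends on the installed VILA revision):

from transformers import GenerationConfig

def build_generation_config(max_new_tokens, temperature):
    # Sample when temperature > 0; otherwise fall back to greedy decoding.
    if temperature > 0:
        return GenerationConfig(
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=float(temperature),
        )
    return GenerationConfig(max_new_tokens=int(max_new_tokens), do_sample=False)

# Hypothetical usage inside vila_infer, replacing generation_config=None:
# out = model.generate_content(
#     prompt=conversation,
#     generation_config=build_generation_config(max_new_tokens, temperature),
# )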