VLM / app.py
WaysAheadGlobal's picture
Create app.py
d78fda0 verified
raw
history blame
1.77 kB
# app.py
import streamlit as st
import torch
from PIL import Image

# Import TinyLLaVA modules (use local copy!)
from tinyllava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from tinyllava.conversation import conv_templates
from tinyllava.mm_utils import (
    get_model_name_from_path,
    process_images,
    tokenizer_image_token,
)
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
# Load TinyLLaVA 3.1B
MODEL_PATH = "bczhou/TinyLLaVA-3.1B"
device = torch.device("cpu")


# Streamlit re-runs this script from the top on every widget interaction.
# Caching the loader means the multi-GB checkpoint is read once per server
# process instead of once per keystroke/upload. show_spinner=False keeps the
# cache from rendering anything, so no Streamlit element is emitted before
# the page configuration call that follows.
@st.cache_resource(show_spinner=False)
def _load_model(model_path: str):
    """Load the TinyLLaVA checkpoint and move the model to ``device``.

    Args:
        model_path: Hub id or local directory handed to
            ``load_pretrained_model``.

    Returns:
        The ``(tokenizer, model, image_processor, context_len)`` tuple
        produced by ``load_pretrained_model``, with ``model`` already moved
        to the module-level ``device``.
    """
    # Disable torch default init for speed
    disable_torch_init()
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path=model_path,
        model_base=None,
        model_name="TinyLLaVA-3.1B",
    )
    model.to(device)
    return tokenizer, model, image_processor, context_len


tokenizer, model, image_processor, context_len = _load_model(MODEL_PATH)
# Streamlit UI
st.set_page_config(page_title="TinyLLaVA 3.1B (Streamlit)", layout="centered")
# The title uses a real llama emoji and an em dash; the previous literal held
# the mis-decoded UTF-8 bytes of those two characters and rendered as garbage.
st.title("🦙 TinyLLaVA 3.1B — Vision-Language Q&A")
# Inputs: an image upload restricted to common raster formats, and a free-text
# question about that image.
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
prompt = st.text_input("Ask a question about the image:")
# Answer the question once both an image and a non-empty prompt are available.
if uploaded_file is not None and prompt:
    # A corrupt or non-image upload should surface as a message in the page,
    # not as a traceback. PIL's "cannot identify image" error is an OSError.
    try:
        image = Image.open(uploaded_file).convert("RGB")
    except OSError:
        st.error("Could not read the uploaded file as an image.")
        st.stop()

    # Process image
    image_tensor = process_images([image], image_processor, model.config)
    image_tensor = image_tensor.to(device)

    # Process prompt
    # The question is wrapped in a conversation template with the image
    # placeholder in front, so the model knows where the image features go.
    # NOTE(review): "phi" is the template the upstream TinyLLaVA examples use
    # for the 3.1B checkpoint; confirm the key exists in the local copy.
    conv = conv_templates["phi"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{prompt}")
    conv.append_message(conv.roles[1], None)
    prompt_text = conv.get_prompt()

    # tokenizer_image_token already returns token ids (its third argument is
    # the image token index, not the context length), so the result is used
    # directly instead of being tokenized a second time. unsqueeze(0) adds the
    # batch dimension, giving a 2-D tensor for generate().
    input_ids = (
        tokenizer_image_token(
            prompt_text, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
        )
        .unsqueeze(0)
        .to(device)
    )

    # Run inference
    # inference_mode() turns off autograd bookkeeping during generation.
    with st.spinner("Generating answer..."), torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=200,
        )

    # Some LLaVA-style generate() implementations echo the prompt ids (which
    # include the non-decodable image placeholder id) ahead of the answer;
    # others return only the new tokens. Strip the echo only when it is there.
    answer_ids = output_ids[0]
    prompt_len = input_ids.shape[1]
    if answer_ids.shape[0] >= prompt_len and torch.equal(
        answer_ids[:prompt_len], input_ids[0]
    ):
        answer_ids = answer_ids[prompt_len:]

    out_text = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    st.subheader("Answer:")
    st.write(out_text)